Re: [RFC PATCH v2 01/10] net: sched: allow qdiscs to handle locking

2016-07-13 Thread John Fastabend
On 16-07-13 11:19 PM, John Fastabend wrote:
> This patch adds a flag for queueing disciplines to indicate the stack
> does not need to use the qdisc lock to protect operations. This can
> be used to build lockless scheduling algorithms and improve
> performance.
> 
> The flag is checked in the tx path and the qdisc lock is only taken
> if it is not set. For now use a conditional if statement. Later we
> could be more aggressive if it proves worthwhile and use a static key
> or wrap this in a likely().
> 
> Signed-off-by: John Fastabend 
> ---

[...]

> @@ -3075,6 +3075,27 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, 
> struct Qdisc *q,
>   int rc;
>  
>   qdisc_calculate_pkt_len(skb, q);
> +
> + if (q->flags & TCQ_F_NOLOCK) {
> + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
> + __qdisc_drop(skb, &to_free);
> + rc = NET_XMIT_DROP;
> + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) {
> + qdisc_bstats_cpu_update(q, skb);
> + __qdisc_run(q);

Reviewing these patches now and noticed this qdisc_run() is not
needed.

> + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
> + __qdisc_run(q);
> + rc = NET_XMIT_SUCCESS;
> + } else {
> + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
> + __qdisc_run(q);
> + }
> +
> + if (unlikely(to_free))
> + kfree_skb_list(to_free);
> + return rc;
> + }
> +

[...]

Thanks,
John



Re: [PATCH] mlxsw: spectrum_router: Return -ENOENT in case of error

2016-07-13 Thread Jiri Pirko
Thu, Jul 14, 2016 at 08:18:45AM CEST, christophe.jail...@wanadoo.fr wrote:
>'vr' should be a valid pointer here, so returning 'PTR_ERR(vr)' is wrong.
>Return an explicit error code (-ENOENT) instead.
>

This is for net-next.

Fixes: 61c503f976 ("mlxsw: spectrum_router: Implement fib4 add/del switchdev 
obj ops")

>Signed-off-by: Christophe JAILLET 

Acked-by: Jiri Pirko 


Thanks.


[RFC PATCH v2 10/10] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq

2016-07-13 Thread John Fastabend
The sch_mq qdisc creates a sub-qdisc per tx queue which are then
called independently for enqueue and dequeue operations. However
statistics are aggregated and pushed up to the "master" qdisc.

This patch adds support for any of the sub-qdiscs to be per cpu
statistic qdiscs. To handle this case add a check when calculating
stats and aggregate the per cpu stats if needed.

Also exports __gnet_stats_copy_queue() to use as a helper function.

Signed-off-by: John Fastabend 
---
 include/net/gen_stats.h |3 +++
 net/core/gen_stats.c|9 +
 net/sched/sch_mq.c  |   25 ++---
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 231e121..5ddc88b 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -47,6 +47,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d,
 int gnet_stats_copy_queue(struct gnet_dump *d,
  struct gnet_stats_queue __percpu *cpu_q,
  struct gnet_stats_queue *q, __u32 qlen);
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+const struct gnet_stats_queue __percpu *cpu_q,
+const struct gnet_stats_queue *q, __u32 qlen);
 int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len);
 
 int gnet_stats_finish_copy(struct gnet_dump *d);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 508e051..a503547 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -254,10 +254,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue 
*qstats,
}
 }
 
-static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
-   const struct gnet_stats_queue __percpu *cpu,
-   const struct gnet_stats_queue *q,
-   __u32 qlen)
+void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
+const struct gnet_stats_queue __percpu *cpu,
+const struct gnet_stats_queue *q,
+__u32 qlen)
 {
if (cpu) {
__gnet_stats_copy_queue_cpu(qstats, cpu);
@@ -271,6 +271,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue 
*qstats,
 
qstats->qlen = qlen;
 }
+EXPORT_SYMBOL(__gnet_stats_copy_queue);
 
 /**
  * gnet_stats_copy_queue - copy queue statistics into statistics TLV
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index b943982..f4b5bbb 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mq_sched {
struct Qdisc**qdiscs;
@@ -107,15 +108,25 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
memset(&sch->qstats, 0, sizeof(sch->qstats));
 
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+   struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
+   struct gnet_stats_queue __percpu *cpu_qstats = NULL;
+   __u32 qlen = 0;
+
qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping;
spin_lock_bh(qdisc_lock(qdisc));
-   sch->q.qlen += qdisc->q.qlen;
-   sch->bstats.bytes   += qdisc->bstats.bytes;
-   sch->bstats.packets += qdisc->bstats.packets;
-   sch->qstats.backlog += qdisc->qstats.backlog;
-   sch->qstats.drops   += qdisc->qstats.drops;
-   sch->qstats.requeues+= qdisc->qstats.requeues;
-   sch->qstats.overlimits  += qdisc->qstats.overlimits;
+
+   if (qdisc_is_percpu_stats(qdisc)) {
+   cpu_bstats = qdisc->cpu_bstats;
+   cpu_qstats = qdisc->cpu_qstats;
+   }
+
+   qlen = qdisc_qlen_sum(qdisc);
+
+   __gnet_stats_copy_basic(NULL, &sch->bstats,
+   cpu_bstats, &qdisc->bstats);
+   __gnet_stats_copy_queue(&sch->qstats,
+   cpu_qstats, &qdisc->qstats, qlen);
+
spin_unlock_bh(qdisc_lock(qdisc));
}
return 0;



[RFC PATCH v2 09/10] net: sched: helper to sum qlen

2016-07-13 Thread John Fastabend
Reporting qlen when qlen is per cpu requires aggregating the per
cpu counters. This adds a helper routine for this.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   15 +++
 1 file changed, 15 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 149f079..d370fee 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -271,6 +271,21 @@ static inline int qdisc_qlen(const struct Qdisc *q)
return q->q.qlen;
 }
 
+static inline int qdisc_qlen_sum(const struct Qdisc *q)
+{
+   __u32 qlen = 0;
+   int i;
+
+   if (q->flags & TCQ_F_NOLOCK) {
+   for_each_possible_cpu(i)
+   qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
+   } else {
+   qlen = q->q.qlen;
+   }
+
+   return qlen;
+}
+
 static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
 {
return (struct qdisc_skb_cb *)skb->cb;



[RFC PATCH v2 08/10] net: sched: pfifo_fast use alf_queue

2016-07-13 Thread John Fastabend
This converts the pfifo_fast qdisc to use the alf_queue enqueue and
dequeue routines then sets the NOLOCK bit.

This also removes the logic used to pick the next band to dequeue from
and instead just checks each alf_queue for packets from top priority
to lowest. This might need to be a bit more clever but seems to work
for now.

Signed-off-by: John Fastabend 
---
 net/sched/sch_generic.c |  131 +++
 1 file changed, 75 insertions(+), 56 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 7dcd066..2ac3eb9 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = {
 
 /*
  * Private data for a pfifo_fast scheduler containing:
- * - queues for the three band
- * - bitmap indicating which of the bands contain skbs
+ * - rings for priority bands
  */
 struct pfifo_fast_priv {
-   u32 bitmap;
-   struct sk_buff_head q[PFIFO_FAST_BANDS];
+   struct skb_array q[PFIFO_FAST_BANDS];
 };
 
-/*
- * Convert a bitmap to the first band number where an skb is queued, where:
- * bitmap=0 means there are no skbs on any band.
- * bitmap=1 means there is an skb on band 0.
- * bitmap=7 means there are skbs on all 3 bands, etc.
- */
-static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
-
-static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
-int band)
+static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
+ int band)
 {
-   return priv->q + band;
+   return &priv->q[band];
 }
 
 static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
  struct sk_buff **to_free)
 {
-   if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
-   int band = prio2band[skb->priority & TC_PRIO_MAX];
-   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-   struct sk_buff_head *list = band2list(priv, band);
-
-   priv->bitmap |= (1 << band);
-   qdisc->q.qlen++;
-   return __qdisc_enqueue_tail(skb, qdisc, list);
-   }
+   int band = prio2band[skb->priority & TC_PRIO_MAX];
+   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
+   struct skb_array *q = band2list(priv, band);
+   int err;
 
-   return qdisc_drop(skb, qdisc, to_free);
+   err = skb_array_produce_bh(q, skb);
+
+   if (unlikely(err))
+   return qdisc_drop_cpu(skb, qdisc, to_free);
+
+   qdisc_qstats_cpu_qlen_inc(qdisc);
+   qdisc_qstats_cpu_backlog_inc(qdisc, skb);
+   return NET_XMIT_SUCCESS;
 }
 
 static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
 {
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-   int band = bitmap2band[priv->bitmap];
+   struct sk_buff *skb = NULL;
+   int band;
 
-   if (likely(band >= 0)) {
-   struct sk_buff_head *list = band2list(priv, band);
-   struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);
+   for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
+   struct skb_array *q = band2list(priv, band);
 
-   qdisc->q.qlen--;
-   if (skb_queue_empty(list))
-   priv->bitmap &= ~(1 << band);
+   if (__skb_array_empty(q))
+   continue;
 
-   return skb;
+   skb = skb_array_consume_bh(q);
}
 
-   return NULL;
-}
-
-static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
-{
-   struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
-   int band = bitmap2band[priv->bitmap];
-
-   if (band >= 0) {
-   struct sk_buff_head *list = band2list(priv, band);
-
-   return skb_peek(list);
+   if (likely(skb)) {
+   qdisc_qstats_cpu_backlog_dec(qdisc, skb);
+   qdisc_bstats_cpu_update(qdisc, skb);
+   qdisc_qstats_cpu_qlen_dec(qdisc);
}
 
-   return NULL;
+   return skb;
 }
 
 static void pfifo_fast_reset(struct Qdisc *qdisc)
 {
-   int prio;
+   int i, band;
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
 
-   for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
-   __qdisc_reset_queue(band2list(priv, prio));
+   for (band = 0; band < PFIFO_FAST_BANDS; band++) {
+   struct skb_array *q = band2list(priv, band);
+   struct sk_buff *skb;
 
-   priv->bitmap = 0;
-   qdisc->qstats.backlog = 0;
-   qdisc->q.qlen = 0;
+   while ((skb = skb_array_consume_bh(q)) != NULL)
+   kfree_skb(skb);
+   }
+
+   for_each_possible_cpu(i) {
+   struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, 

[RFC PATCH v2 07/10] net: sched: support skb_bad_tx with lockless qdisc

2016-07-13 Thread John Fastabend
Similar to how gso is handled skb_bad_tx needs to be per cpu to handle
lockless qdisc with multiple writer/producers.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |7 +++
 net/sched/sch_api.c   |5 ++
 net/sched/sch_generic.c   |   94 +
 3 files changed, 97 insertions(+), 9 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 7b140e2..149f079 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -40,6 +40,10 @@ struct gso_cell {
struct sk_buff *skb;
 };
 
+struct bad_txq_cell {
+   struct sk_buff *skb;
+};
+
 struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
   struct Qdisc *sch,
@@ -77,7 +81,8 @@ struct Qdisc {
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
 
-   struct gso_cell __percpu *gso_cpu_skb;
+   struct gso_cell __percpu *gso_cpu_skb;
+   struct bad_txq_cell __percpu *skb_bad_txq_cpu;
 
/*
 * For performance sake on SMP, we put highly modified fields at the end
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d713052..50088e2 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -970,6 +970,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue 
*dev_queue,
sch->gso_cpu_skb = alloc_percpu(struct gso_cell);
if (!sch->gso_cpu_skb)
goto err_out4;
+
+   sch->skb_bad_txq_cpu = alloc_percpu(struct 
bad_txq_cell);
+   if (!sch->skb_bad_txq_cpu)
+   goto err_out4;
}
 
if (tca[TCA_STAB]) {
@@ -1021,6 +1025,7 @@ err_out4:
free_percpu(sch->cpu_bstats);
free_percpu(sch->cpu_qstats);
free_percpu(sch->gso_cpu_skb);
+   free_percpu(sch->skb_bad_txq_cpu);
/*
 * Any broken qdiscs that would require a ops->reset() here?
 * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8a665dc..7dcd066 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,6 +44,42 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
+static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *sch)
+{
+   if (sch->skb_bad_txq_cpu) {
+   struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu);
+
+   return cell->skb;
+   }
+
+   return sch->skb_bad_txq;
+}
+
+static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *sch,
+struct sk_buff *skb)
+{
+   if (sch->skb_bad_txq_cpu) {
+   struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu);
+
+   cell->skb = skb;
+   return;
+   }
+
+   sch->skb_bad_txq = skb;
+}
+
+static inline void qdisc_null_skb_bad_txq(struct Qdisc *sch)
+{
+   if (sch->skb_bad_txq_cpu) {
+   struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu);
+
+   cell->skb = NULL;
+   return;
+   }
+
+   sch->skb_bad_txq = NULL;
+}
+
 static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch)
 {
if (sch->gso_cpu_skb)
@@ -129,9 +165,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
if (!nskb)
break;
if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
-   q->skb_bad_txq = nskb;
-   qdisc_qstats_backlog_inc(q, nskb);
-   q->q.qlen++;
+   qdisc_enqueue_skb_bad_txq(q, nskb);
+
+   if (qdisc_is_percpu_stats(q)) {
+   qdisc_qstats_cpu_backlog_inc(q, nskb);
+   qdisc_qstats_cpu_qlen_inc(q);
+   } else {
+   qdisc_qstats_backlog_inc(q, nskb);
+   q->q.qlen++;
+   }
break;
}
skb->next = nskb;
@@ -160,7 +202,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool 
*validate,
qdisc_null_gso_skb(q);
 
if (qdisc_is_percpu_stats(q)) {
-   qdisc_qstats_cpu_backlog_inc(q, skb);
+   qdisc_qstats_cpu_backlog_dec(q, skb);
qdisc_qstats_cpu_qlen_dec(q);
} else {
qdisc_qstats_backlog_dec(q, skb);
@@ -171,14 +213,19 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool 
*validate,
  

[RFC PATCH v2 06/10] net: sched: support qdisc_reset on NOLOCK qdisc

2016-07-13 Thread John Fastabend
The qdisc_reset operation depends on the qdisc lock at the moment
to halt any additions to gso_skb and statistics while the list is
free'd and the stats zeroed.

Without the qdisc lock we can not guarantee another cpu is not in
the process of adding a skb to one of the "cells". Here are the
two cases we have to handle.

 case 1: qdisc_graft operation. In this case a "new" qdisc is attached
 and the 'qdisc_destroy' operation is called on the old qdisc.
 The destroy operation will wait a rcu grace period and call
 qdisc_rcu_free(). At which point gso_cpu_skb is free'd along
 with all stats so no need to zero stats and gso_cpu_skb from
 the reset operation itself.

 Because we can not continue to call qdisc_reset before waiting
 an rcu grace period so that the qdisc is detached from all
 cpus simply do not call qdisc_reset() at all and let the
 qdisc_destroy operation clean up the qdisc. Note, a refcnt
 greater than 1 would cause the destroy operation to be
 aborted however if this ever happened the reference to the
 qdisc would be lost and we would have a memory leak.

 case 2: dev_deactivate sequence. This can come from a user bringing
 the interface down which causes the gso_skb list to be flushed
 and the qlen zero'd. At the moment this is protected by the
 qdisc lock so while we clear the qlen/gso_skb fields we are
 guaranteed no new skbs are added. For the lockless case
 though this is not true. To resolve this move the qdisc_reset
 call after the new qdisc is assigned and a grace period is
 exercised to ensure no new skbs can be enqueued. Further
 the RTNL lock is held so we can not get another call to
 activate the qdisc while the skb lists are being free'd.

 Finally, fix qdisc_reset to handle the per cpu stats and
 skb lists.

Signed-off-by: John Fastabend 
---
 net/sched/sch_generic.c |   45 +++--
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index f903093..8a665dc 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -739,6 +739,20 @@ void qdisc_reset(struct Qdisc *qdisc)
kfree_skb(qdisc->skb_bad_txq);
qdisc->skb_bad_txq = NULL;
 
+   if (qdisc->gso_cpu_skb) {
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct gso_cell *cell;
+
+   cell = per_cpu_ptr(qdisc->gso_cpu_skb, i);
+   if (cell) {
+   kfree_skb_list(cell->skb);
+   cell = NULL;
+   }
+   }
+   }
+
if (qdisc->gso_skb) {
kfree_skb_list(qdisc->gso_skb);
qdisc->gso_skb = NULL;
@@ -814,10 +828,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue 
*dev_queue,
root_lock = qdisc_lock(oqdisc);
spin_lock_bh(root_lock);
 
-   /* Prune old scheduler */
-   if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
-   qdisc_reset(oqdisc);
-
/* ... and graft new one */
if (qdisc == NULL)
qdisc = &noop_qdisc;
@@ -931,7 +941,6 @@ static void dev_deactivate_queue(struct net_device *dev,
set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
 
rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
-   qdisc_reset(qdisc);
 
spin_unlock_bh(qdisc_lock(qdisc));
}
@@ -968,6 +977,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
return false;
 }
 
+static void dev_qdisc_reset(struct net_device *dev,
+   struct netdev_queue *dev_queue,
+   void *none)
+{
+   struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
+
+   if (qdisc)
+   qdisc_reset(qdisc);
+}
+
 /**
  * dev_deactivate_many - deactivate transmissions on several devices
  * @head: list of devices to deactivate
@@ -978,7 +997,6 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 void dev_deactivate_many(struct list_head *head)
 {
struct net_device *dev;
-   bool sync_needed = false;
 
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
@@ -988,20 +1006,27 @@ void dev_deactivate_many(struct list_head *head)
 &noop_qdisc);
 
dev_watchdog_down(dev);
-   sync_needed |= !dev->dismantle;
}
 
/* Wait for outstanding qdisc-less dev_queue_xmit calls.
 * This is avoided if all devices are in dismantle phase :
 * Caller will call synchronize_net() for us
 */
-   if (sync_needed)
-   synchronize_net();
+   synchronize_net();
 
/* Wait fo

[RFC PATCH v2 05/10] net: sched: per cpu gso handlers

2016-07-13 Thread John Fastabend
The net sched infrastructure has a gso ptr that points to skb structs
that have failed to be enqueued by the device driver.

This can happen when multiple cores try to push a skb onto the same
underlying hardware queue resulting in lock contention. This case is
handled by a cpu collision handler handle_dev_cpu_collision(). Another
case occurs when the stack overruns the drivers low level tx queues
capacity. Ideally these should be a rare occurrence in a well-tuned
system but they do happen.

To handle this in the lockless case use a per cpu gso field to park
the skb until the conflict can be resolved. Note at this point the
skb has already been popped off the qdisc so it has to be handled
by the infrastructure.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   37 +++
 net/sched/sch_api.c   |7 
 net/sched/sch_generic.c   |   71 ++---
 3 files changed, 110 insertions(+), 5 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f69da4b..7b140e2 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,10 @@ struct qdisc_size_table {
u16 data[];
 };
 
+struct gso_cell {
+   struct sk_buff *skb;
+};
+
 struct Qdisc {
int (*enqueue)(struct sk_buff *skb,
   struct Qdisc *sch,
@@ -73,6 +77,8 @@ struct Qdisc {
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
 
+   struct gso_cell __percpu *gso_cpu_skb;
+
/*
 * For performance sake on SMP, we put highly modified fields at the end
 */
@@ -725,6 +731,22 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct 
Qdisc *sch)
return sch->gso_skb;
 }
 
+static inline struct sk_buff *qdisc_peek_dequeued_cpu(struct Qdisc *sch)
+{
+   struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb);
+
+   if (!gso->skb) {
+   struct sk_buff *skb = sch->dequeue(sch);
+
+   if (skb) {
+   gso->skb = skb;
+   qdisc_qstats_cpu_qlen_inc(sch);
+   }
+   }
+
+   return gso->skb;
+}
+
 /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */
 static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
 {
@@ -741,6 +763,21 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct 
Qdisc *sch)
return skb;
 }
 
+static inline struct sk_buff *qdisc_dequeue_peeked_skb(struct Qdisc *sch)
+{
+   struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb);
+   struct sk_buff *skb = gso->skb;
+
+   if (skb) {
+   gso->skb = NULL;
+   qdisc_qstats_cpu_qlen_dec(sch);
+   } else {
+   skb = sch->dequeue(sch);
+   }
+
+   return skb;
+}
+
 static inline void __qdisc_reset_queue(struct sk_buff_head *list)
 {
/*
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 12ebde8..d713052 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -966,6 +966,12 @@ qdisc_create(struct net_device *dev, struct netdev_queue 
*dev_queue,
goto err_out4;
}
 
+   if (sch->flags & TCQ_F_NOLOCK) {
+   sch->gso_cpu_skb = alloc_percpu(struct gso_cell);
+   if (!sch->gso_cpu_skb)
+   goto err_out4;
+   }
+
if (tca[TCA_STAB]) {
stab = qdisc_get_stab(tca[TCA_STAB]);
if (IS_ERR(stab)) {
@@ -1014,6 +1020,7 @@ err_out:
 err_out4:
free_percpu(sch->cpu_bstats);
free_percpu(sch->cpu_qstats);
+   free_percpu(sch->gso_cpu_skb);
/*
 * Any broken qdiscs that would require a ops->reset() here?
 * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index fc70204..f903093 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -44,8 +44,25 @@ EXPORT_SYMBOL(default_qdisc_ops);
  * - ingress filtering is also serialized via qdisc root lock
  * - updates to tree and tree walking are only done under the rtnl mutex.
  */
+static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch)
+{
+   if (sch->gso_cpu_skb)
+   return (this_cpu_ptr(sch->gso_cpu_skb))->skb;
 
-static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
+   return sch->gso_skb;
+}
+
+static inline void qdisc_null_gso_skb(struct Qdisc *sch)
+{
+   if (sch->gso_cpu_skb) {
+   (this_cpu_ptr(sch->gso_cpu_skb))->skb = NULL;
+   return;
+   }
+
+   sch->gso_skb = NULL;
+}
+
+static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
 {
q->gso_skb = skb;
q->qstats.requeues++;
@@ -56,6 +73,25 @@ static inline int 

[RFC PATCH v2 04/10] net: sched: a dflt qdisc may be used with per cpu stats

2016-07-13 Thread John Fastabend
Enable dflt qdisc support for per cpu stats. Before this patch a
dflt qdisc was required to use the global statistics qstats and
bstats.

Signed-off-by: John Fastabend 
---
 net/sched/sch_generic.c |   24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 2c3e23b..fc70204 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -647,18 +647,34 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue 
*dev_queue,
struct Qdisc *sch;
 
if (!try_module_get(ops->owner))
-   goto errout;
+   return NULL;
 
sch = qdisc_alloc(dev_queue, ops);
if (IS_ERR(sch))
-   goto errout;
+   return NULL;
sch->parent = parentid;
 
-   if (!ops->init || ops->init(sch, NULL) == 0)
+   if (!ops->init)
return sch;
 
-   qdisc_destroy(sch);
+   if (ops->init(sch, NULL))
+   goto errout;
+
+   /* init() may have set percpu flags so init data structures */
+   if (qdisc_is_percpu_stats(sch)) {
+   sch->cpu_bstats =
+   netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+   if (!sch->cpu_bstats)
+   goto errout;
+
+   sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
+   if (!sch->cpu_qstats)
+   goto errout;
+   }
+
+   return sch;
 errout:
+   qdisc_destroy(sch);
return NULL;
 }
 EXPORT_SYMBOL(qdisc_create_dflt);



[RFC PATCH v2 03/10] net: sched: provide per cpu qstat helpers

2016-07-13 Thread John Fastabend
The per cpu qstats support was added with per cpu bstat support which
is currently used by the ingress qdisc. This patch adds a set of
helpers needed to make other qdiscs that use qstats per cpu as well.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |   39 +++
 1 file changed, 39 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 354951d..f69da4b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -573,12 +573,43 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc 
*sch,
sch->qstats.backlog -= qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
+   const struct sk_buff *skb)
+{
+   struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats);
+
+   q->backlog -= qdisc_pkt_len(skb);
+}
+
 static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
const struct sk_buff *skb)
 {
sch->qstats.backlog += qdisc_pkt_len(skb);
 }
 
+static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
+   const struct sk_buff *skb)
+{
+   struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats);
+
+   q->backlog += qdisc_pkt_len(skb);
+}
+
+static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
+{
+   this_cpu_ptr(sch->cpu_qstats)->qlen++;
+}
+
+static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
+{
+   this_cpu_ptr(sch->cpu_qstats)->qlen--;
+}
+
+static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
+{
+   this_cpu_ptr(sch->cpu_qstats)->requeues++;
+}
+
 static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
 {
sch->qstats.drops += count;
@@ -751,6 +782,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, 
struct Qdisc *sch)
qdisc_qstats_drop(sch);
 }
 
+static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
+struct sk_buff **to_free)
+{
+   __qdisc_drop(skb, to_free);
+   qdisc_qstats_cpu_drop(sch);
+
+   return NET_XMIT_DROP;
+}
 
 static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
 struct sk_buff **to_free)



Re: [PATCH net-next 1/2] devlink: fix build error for CONFIG_MODULES=n

2016-07-13 Thread Jiri Pirko
Wed, Jul 13, 2016 at 11:03:37PM CEST, a...@arndb.de wrote:
>A driver calling trace_devlink_hwmsg cannot be built when modules are disabled:
>
>include/trace/events/devlink.h: In function 
>'trace_event_get_offsets_devlink_hwmsg':
>include/trace/events/devlink.h:25:51: error: dereferencing pointer to 
>incomplete type 'struct module'
>   __string(owner_name, devlink->dev->driver->owner->name)
>
>This changes the code to only print the module name when modules are actually
>enabled, otherwise we hardcode the string "built-in".
>
>Signed-off-by: Arnd Bergmann 
>Fixes: e5224f0fe2ac ("devlink: add hardware messages tracing facility")
>---
> include/trace/events/devlink.h | 8 
> 1 file changed, 8 insertions(+)
>
>diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h
>index 333c32ac9bfa..26f92d3c7e9c 100644
>--- a/include/trace/events/devlink.h
>+++ b/include/trace/events/devlink.h
>@@ -22,7 +22,11 @@ TRACE_EVENT(devlink_hwmsg,
>   TP_STRUCT__entry(
>   __string(bus_name, devlink->dev->bus->name)
>   __string(dev_name, dev_name(devlink->dev))
>+#ifdef CONFIG_MODULES
>   __string(owner_name, devlink->dev->driver->owner->name)

I think it would be better to use driver->name. It looks like it is always
present. I will do some tests and send a patch.


>+#else
>+  __string(owner_name, "built-in")
>+#endif
>   __field(bool, incoming)
>   __field(unsigned long, type)
>   __dynamic_array(u8, buf, len)
>@@ -32,7 +36,11 @@ TRACE_EVENT(devlink_hwmsg,
>   TP_fast_assign(
>   __assign_str(bus_name, devlink->dev->bus->name);
>   __assign_str(dev_name, dev_name(devlink->dev));
>+#ifdef CONFIG_MODULES
>   __assign_str(owner_name, devlink->dev->driver->owner->name);
>+#else
>+  __assign_str(owner_name, "built-in");
>+#endif
>   __entry->incoming = incoming;
>   __entry->type = type;
>   memcpy(__get_dynamic_array(buf), buf, len);
>-- 
>2.9.0
>


[RFC PATCH v2 00/10] running qdiscs without qdisc_lock

2016-07-13 Thread John Fastabend
Hi,

I thought I should go ahead and send this series out for comments.
Here I allow qdiscs to be run without taking the qdisc lock. As a
result statistics, gso skb, tx bad skb and a few other things need
to be "safe" to run without locks. It _should_ all be covered here.
Although I just noticed I must be missing a dec on the backlog
counter somewhere as one of my tests just ended with 0packets but
a nonzero bytes counter.

Also of note in this series I used the skb_array implementation
already in net-next for the tun/tap devices. With this implementation
for cases where lots of threads are hitting the same qdisc I see
a modest improvement but for cases like mq with pktgen where
everything is lined up nicely I see a fairly unpleasant regression.

I have a few thoughts on how to resolve this. First if we support
bulk_dequeue as an operation on the skb_array this should help
vs getting the consumer lock repeatedly. Also we really don't need
the HARD_TX_LOCK if we have a core per queue and XPS setup like many
multiqueue nics default to. And I need to go back and look at the
original alf ring implementation as well to see how it compares; I
don't recall seeing the mq regression there.

Also after the above it might be nice to make all qdiscs support
the per cpu statistics and drop non per cpu cases just to simplify
the code and all the if/else branching where its not needed.

As usual any thoughts, comments, etc are welcome.

And I wasn't going to add these numbers just because they come from
an untuned system but why not.

Here are some initial numbers from pktgen on my development which
is a reasonable system (E5-2695) but I didn't do any work to tweak
the config so there is still a bunch of debug/hacking options still
running.

The pktgen command is

./samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i eth3  -t X -s 64

pfifo_fast

original ppslocklessdiff
1   1418168 1269450 -148718
2   1587390 1553408 -33982
4   1084961 1683639 +598678
8   989636  1522723 +533087
12  1014018 1348172 +334154

mq
   original ppslocklessdiff
1  1442018 1205180-236838 
2  2646069 2266095-379974
4  5136200 4269470-866730
8  
12 1327567110810909   -2464762

---

John Fastabend (10):
  net: sched: allow qdiscs to handle locking
  net: sched: qdisc_qlen for per cpu logic
  net: sched: provide per cpu qstat helpers
  net: sched: a dflt qdisc may be used with per cpu stats
  net: sched: per cpu gso handlers
  net: sched: support qdisc_reset on NOLOCK qdisc
  net: sched: support skb_bad_tx with lockless qdisc
  net: sched: pfifo_fast use alf_queue
  net: sched: helper to sum qlen
  net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq


 include/net/gen_stats.h   |3 
 include/net/sch_generic.h |  105 
 net/core/dev.c|   32 +++-
 net/core/gen_stats.c  |9 +
 net/sched/sch_api.c   |   12 +
 net/sched/sch_generic.c   |  385 +++--
 net/sched/sch_mq.c|   25 ++-
 7 files changed, 467 insertions(+), 104 deletions(-)

--


[RFC PATCH v2 02/10] net: sched: qdisc_qlen for per cpu logic

2016-07-13 Thread John Fastabend
This is a bit interesting because it means sch_direct_xmit will
return a positive value which causes the dequeue/xmit cycle to
continue only when a specific cpu has a qlen > 0.

However checking each cpu for qlen will break performance so
its important to note that qdiscs that set the no lock bit need
to have some sort of per cpu enqueue/dequeue data structure that
maps to the per cpu qlen value.

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |8 
 1 file changed, 8 insertions(+)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3de6a8c..354951d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -247,8 +247,16 @@ static inline void qdisc_cb_private_validate(const struct 
sk_buff *skb, int sz)
BUILD_BUG_ON(sizeof(qcb->data) < sz);
 }
 
+static inline int qdisc_qlen_cpu(const struct Qdisc *q)
+{
+   return this_cpu_ptr(q->cpu_qstats)->qlen;
+}
+
 static inline int qdisc_qlen(const struct Qdisc *q)
 {
+   if (q->flags & TCQ_F_NOLOCK)
+   return qdisc_qlen_cpu(q);
+
return q->q.qlen;
 }
 



[RFC PATCH v2 01/10] net: sched: allow qdiscs to handle locking

2016-07-13 Thread John Fastabend
This patch adds a flag for queueing disciplines to indicate the stack
does not need to use the qdisc lock to protect operations. This can
be used to build lockless scheduling algorithms and improving
performance.

The flag is checked in the tx path and the qdisc lock is only taken
if it is not set. For now use a conditional if statement. Later we
could be more aggressive if it proves worthwhile and use a static key
or wrap this in a likely().

Signed-off-by: John Fastabend 
---
 include/net/sch_generic.h |1 +
 net/core/dev.c|   32 
 net/sched/sch_generic.c   |   24 
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 909aff2..3de6a8c 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -58,6 +58,7 @@ struct Qdisc {
 #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy :
  * qdisc_tree_decrease_qlen() should stop.
  */
+#define TCQ_F_NOLOCK   0x80 /* qdisc does not require locking */
u32 limit;
const struct Qdisc_ops  *ops;
struct qdisc_size_table __rcu *stab;
diff --git a/net/core/dev.c b/net/core/dev.c
index b92d63b..f35d449 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3075,6 +3075,27 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, 
struct Qdisc *q,
int rc;
 
qdisc_calculate_pkt_len(skb, q);
+
+   if (q->flags & TCQ_F_NOLOCK) {
+   if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+   __qdisc_drop(skb, &to_free);
+   rc = NET_XMIT_DROP;
+   } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) {
+   qdisc_bstats_cpu_update(q, skb);
+   __qdisc_run(q);
+   if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
+   __qdisc_run(q);
+   rc = NET_XMIT_SUCCESS;
+   } else {
+   rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
+   __qdisc_run(q);
+   }
+
+   if (unlikely(to_free))
+   kfree_skb_list(to_free);
+   return rc;
+   }
+
/*
 * Heuristic to force contended enqueues to serialize on a
 * separate lock before trying to get qdisc main lock.
@@ -3896,19 +3917,22 @@ static void net_tx_action(struct softirq_action *h)
 
while (head) {
struct Qdisc *q = head;
-   spinlock_t *root_lock;
+   spinlock_t *root_lock = NULL;
 
head = head->next_sched;
 
-   root_lock = qdisc_lock(q);
-   spin_lock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK)) {
+   root_lock = qdisc_lock(q);
+   spin_lock(root_lock);
+   }
/* We need to make sure head->next_sched is read
 * before clearing __QDISC_STATE_SCHED
 */
smp_mb__before_atomic();
clear_bit(__QDISC_STATE_SCHED, &q->state);
qdisc_run(q);
-   spin_unlock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK))
+   spin_unlock(root_lock);
}
}
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index e95b67c..2c3e23b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -170,7 +170,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
int ret = NETDEV_TX_BUSY;
 
/* And release qdisc */
-   spin_unlock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK))
+   spin_unlock(root_lock);
 
/* Note that we validate skb (GSO, checksum, ...) outside of locks */
if (validate)
@@ -183,10 +184,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
HARD_TX_UNLOCK(dev, txq);
} else {
-   spin_lock(root_lock);
+   if (!(q->flags & TCQ_F_NOLOCK))
+   spin_lock(root_lock);
return qdisc_qlen(q);
}
-   spin_lock(root_lock);
+
+   if (!(q->flags & TCQ_F_NOLOCK))
+   spin_lock(root_lock);
 
if (dev_xmit_complete(ret)) {
/* Driver sent out skb successfully or skb was consumed */
@@ -868,14 +872,18 @@ static bool some_qdisc_is_busy(struct net_device *dev)
 
dev_queue = netdev_get_tx_queue(dev, i);
q = dev_queue->qdisc_sleeping;
-   root_lock = qdisc_lock(q);
 
-   spin_lock_bh(root_lock);
+   i

[PATCH] mlxsw: spectrum_router: Return -ENOENT in case of error

2016-07-13 Thread Christophe JAILLET
'vr' should be a valid pointer here, so returning 'PTR_ERR(vr)' is wrong.
Return an explicit error code (-ENOENT) instead.

Signed-off-by: Christophe JAILLET 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index e084ea5..81418d6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1803,7 +1803,7 @@ int mlxsw_sp_router_fib4_del(struct mlxsw_sp_port 
*mlxsw_sp_port,
  sizeof(fib4->dst), fib4->dst_len);
if (!fib_entry) {
dev_warn(mlxsw_sp->bus_info->dev, "Failed to find FIB4 entry 
being removed.\n");
-   return PTR_ERR(vr);
+   return -ENOENT;
}
mlxsw_sp_fib_entry_del(mlxsw_sp_port->mlxsw_sp, fib_entry);
mlxsw_sp_fib_entry_remove(vr->fib, fib_entry);
-- 
2.7.4


---
L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel 
antivirus Avast.
https://www.avast.com/antivirus



Re: [PATCH v11 21/22] IB/hns: Kconfig and Makefile for RoCE module

2016-07-13 Thread Leon Romanovsky
On Sat, Jul 02, 2016 at 05:39:23PM +0800, Lijun Ou wrote:
> This patch added Kconfig and Makefile for building RoCE module.
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v11:
> hns_roce_icm.o -> hns_roce_hem.o
> 
> PATCH v10/v9/v8/v7/v6/v5:
> - No change over the PATCH v4
> 
> PATCH v4:
> This fixes the comments given by Christoph Hellwig over the PATCH v3:
>   Link: https://lkml.org/lkml/2016/3/22/609
> 
> PATCH V3:
> This fixes the comments given by Leon Romanovsky over the PATCH v2:
>   Link: https://lkml.org/lkml/2016/3/20/5
> 
> PATCH v2:
> This fixes the comments given by Leon Romanovsky over the PATCH v1:
>   Link: https://lkml.org/lkml/2016/3/6/94
> Fixes the error tested by kbuild test robot over the PATCH v1:
>   Link: https://lkml.org/lkml/2016/3/4/343
> 
> PATCH v1:
> - The initial patch
> ---
> ---
>  drivers/infiniband/Kconfig |  1 +
>  drivers/infiniband/hw/Makefile |  1 +
>  drivers/infiniband/hw/hns/Kconfig  | 10 ++
>  drivers/infiniband/hw/hns/Makefile |  8 
>  4 files changed, 20 insertions(+)
>  create mode 100644 drivers/infiniband/hw/hns/Kconfig
>  create mode 100644 drivers/infiniband/hw/hns/Makefile
> 
> diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
> index 2137adf..767f92b 100644
> --- a/drivers/infiniband/Kconfig
> +++ b/drivers/infiniband/Kconfig
> @@ -74,6 +74,7 @@ source "drivers/infiniband/hw/mlx5/Kconfig"
>  source "drivers/infiniband/hw/nes/Kconfig"
>  source "drivers/infiniband/hw/ocrdma/Kconfig"
>  source "drivers/infiniband/hw/usnic/Kconfig"
> +source "drivers/infiniband/hw/hns/Kconfig"
>  
>  source "drivers/infiniband/ulp/ipoib/Kconfig"
>  
> diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
> index c0c7cf8..2ad851d 100644
> --- a/drivers/infiniband/hw/Makefile
> +++ b/drivers/infiniband/hw/Makefile
> @@ -9,3 +9,4 @@ obj-$(CONFIG_INFINIBAND_NES)  += nes/
>  obj-$(CONFIG_INFINIBAND_OCRDMA)  += ocrdma/
>  obj-$(CONFIG_INFINIBAND_USNIC)   += usnic/
>  obj-$(CONFIG_INFINIBAND_HFI1)+= hfi1/
> +obj-$(CONFIG_INFINIBAND_HISILICON_HNS)   += hns/

--^^^--
There is no need in HISILICON word here.

> diff --git a/drivers/infiniband/hw/hns/Kconfig 
> b/drivers/infiniband/hw/hns/Kconfig
> new file mode 100644
> index 000..c47c168
> --- /dev/null
> +++ b/drivers/infiniband/hw/hns/Kconfig
> @@ -0,0 +1,10 @@
> +config INFINIBAND_HISILICON_HNS
> + tristate "Hisilicon Hns ROCE Driver"

And you are still inconsistent with the names
Hisilicon/HiSilicon/hisilicon/HISILICON/e.t.c., ROCE/roce/RoCE/e.t.c.


signature.asc
Description: Digital signature


Re: [PATCH v11 00/22] Add HiSilicon RoCE driver

2016-07-13 Thread Leon Romanovsky
On Thu, Jul 14, 2016 at 11:43:59AM +0800, oulijun wrote:
> 在 2016/7/2 17:39, Lijun Ou 写道:
> > 
> Hi, Doug & Sean Hefty & Hal Rosenstock
> "Hello, I understand that maintainer is dealing with lots of patches not just 
> mine. Also, I could not see any further review comments from the community.
>  I also understand that I should not resend the patch-set again unless I am 
> sure my patch-set is lost.
>  I was just wondering what should I do in the current circumstance where my 
> PATCH" has not activity.
>  I am not sure if this has been accepted or how much I need to wait to resend 
> it (if ever). Please guide, I am new to open-source and learning from people 
> like you. Thanks a lot :)

You was asked numerous times to clean the mess in your TO/CC fields.
Most of the people have nothing to do with your submission.
Understanding who is the RDMA maintainer will help you a lot (hint: it
is one of three in your opening sentence). Another request from you which
you successfully ignored, was to stop reply with whole email,
but reply with relevant part only.

Ignoring community rules is a good way to be ignored back.

BTW, you don't need to resend patches, please follow after patchwork status
https://patchwork.kernel.org/project/linux-rdma/list/?submitter=157841&state=1

> 
> Thanks
> Lijun Ou
> 
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: Digital signature


Re: [PATCH 0/2] Code style fixes

2016-07-13 Thread David Miller
From: Elad Kanfi 
Date: Wed, 13 Jul 2016 16:58:05 +0300

> Fix all checkpatch warnings and errors, and reuse code

Series applied to net-next, thanks.


Re: [PATCH v11 00/22] Add HiSilicon RoCE driver

2016-07-13 Thread oulijun
在 2016/7/2 17:39, Lijun Ou 写道:
> The HiSilicon Network Substem is a long term evolution IP which is
> supposed to be used in HiSilicon ICT SoCs. HNS (HiSilicon Network
> Sybsystem) also has a hardware support of performing RDMA with
> RoCEE.
> The driver for HiSilicon RoCEE(RoCE Engine) is a platform driver and
> will support mulitple versions of SOCs in future. This version of driver
> is meant to support Hip06 SoC(which confirms to RoCEv1 hardware
> specifications).
> 
> Changes v10 -> v11:
> [1/22]:
> 1. modify the print description of chip don't support roce
> 2. remove explicit values for enums for patch series
> [3/22]:
> 3. remove non-essential headers for patch series
> 4. add judgement for port_cnt is zero
> 5. Keep unified print style for "set mask..." vs. "No usable
>..."
> 6. modify the MODULE_LICENSE
> 7. remove MODULE_ALIAS
> [4/22]:
> 8. Move this line out of if-else and leave "if (enable)" part only
> 9. renaming the meaningful definition to 20 for patch series
> 10. delete extern keyword for hns_dsaf_roce_reset function
> 11. delete void keyword for hr_dev->hw->reset when driver removed
> [5/22]:
> 12. remove few unnecessary variables and some lines.
> 13. remove the function for one line of code which will be called
> once only for patch series
> [6/22]:
> 14. redesign the method for calculating token_mask' value
> [7/22]:
> 15. delete hns_roce_status_to_errno
> 16. modify the one enum only for all patches
> 17. remove the spin_lock in hns_roce_cq_event function
> 18. add comment here that 0x10 and 0x11 in hns_roce_event struct
> 19. refactor hns_roce_aeq_int function and It has switch in switch
> and it is almost 200 LOCs
> 20. simplify the lines for err_out_free_pages branch
> [8/22]:
> 21. remove icm and redesign it for patch series
> 
> Changes v9 -> v10:
> 1. delete redundant lines which it is netdevice.h in hns_roce_main.c
> 2. adjust the indentation for HNS_ROCE_V1_NUM_ASYNC_EQE
> 3. simplify the lines in hns_roce_init_qp_table function
> 4. add static type for hns_roce_unregister_device
> 5. move the call with hns_roce_unregister_device from the tenth patch to
>the eleventh patch in hns_roce_remove function
> 6. readjuest the alphabetic order in MAINTAINERS
> 7. redesigned the way for getting irq names
> 8. avoid the memory leakage because mr->pbl is not free in
>hns_roce_mr function
> 9. avoid the memory leakage because not kfree table->icm when exception
> 10. add the link from LKML as well whose comment in all
> 
> Changes v8 -> v9:
> 1. delete the definition of ADDR_SHIFT_n, use literal 12, 32 and 44 and
>add comments
> 2. use roce_read/roce_write/readl/write instead of roce_readl/roce_writel
> 3. delete the print error/debug messages for memory allocation errors
> 4. use exit instead of uninit, for example hw->uninit -> hw->exit
> 5. use roce_raw_write instead of _raw_writel in eq_set_cons_index
> 6. modify the label with underscore
> 7. adjust the indentation for the macro definitions in hns_roce_hw_v1.c
> 8. simplify some lines in few functions and structures
> 9. adjust the alphabetic order in MAINTAINERS
> 
> Changes v7 -> v8:
> 1. add a verbs operation named get_port_immutable. It is an 
>independent patch
> 2. add a comment for the definition of ADDR_SHIFT_n, n are 12,32
>and 44
> 3. restructures the code to align with naming convention of the Linux
>according to the review of Doug Ledford
> 4. modify the state for all .c and .h files
> 
> Changes v6 -> v7:
> 1. modify some type of parameter, use bool replace the original type
> 2. add the Signed-off-by signatures in the first patch
> 3. delete the improper print sentence in hns_roce_create_eq.
> 
> Changes v5 -> v6:
> 1. modify the type of obj for unsigned long according the reviews, and
>modify the same questions in RoCE module
> 2. fix the spelling error
> 3. fix the Signed-off-by signatures
> 
> Changes v4 -> v5:
> 1. redesign the patchset for RoCE modules in order to split the huge
>patch into small patches
> 2. fix the directory path for RoCE module. Delete the hisilicon level.
> 3. modify the name of roce_v1_hw into roce_hw_v1
> 
> Changes v3 -> v4:
> 1. modify roce.o into hns-roce.o in Makefile and Kconfig file
> 
> Changes v2 -> v3:
> 1. modify the formats of RoCE driver code base v2 by the experts 
>reviewing. also, it used kmalloc_array instead of kmalloc, kcalloc
>instead of kzalloc, when refer to memory allocation for array
> 2. remove some functions without use and unconnected macros
> 3. modify the binding document with RoCE DT base v2 which added
>interrupt-names
> 4. redesign the port_map and si_map in hns_dsaf_roce_reset
> 5. add HiSilicon RoCE driver maintainers introduction in MAINTAINERS
>document
> 
> Changes v1 -> v2:
> 1. modify the formats of roce driver code by the experts reviewing
> 2. modify the bindings file with roce dts. add the attribute named 
>interrput-names.
> 3. modify the way of defining port mode in hns_dsaf_main.c
> 

[PATCH 2/2] net: nps_enet: code reuse

2016-07-13 Thread Elad Kanfi
From: Elad Kanfi 

Add inline function that checks if there is a pending tx packet.

Signed-off-by: Elad Kanfi 
---
 drivers/net/ethernet/ezchip/nps_enet.c |   21 +++--
 1 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/ezchip/nps_enet.c 
b/drivers/net/ethernet/ezchip/nps_enet.c
index b182e2a..25faa3d 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.c
+++ b/drivers/net/ethernet/ezchip/nps_enet.c
@@ -24,6 +24,14 @@
 
 #define DRV_NAME   "nps_mgt_enet"
 
+static inline bool nps_enet_is_tx_pending(struct nps_enet_priv *priv)
+{
+   u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL);
+   u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT;
+
+   return (!tx_ctrl_ct && priv->tx_skb);
+}
+
 static void nps_enet_clean_rx_fifo(struct net_device *ndev, u32 frame_len)
 {
struct nps_enet_priv *priv = netdev_priv(ndev);
@@ -141,12 +149,11 @@ static void nps_enet_tx_handler(struct net_device *ndev)
 {
struct nps_enet_priv *priv = netdev_priv(ndev);
u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL);
-   u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT;
u32 tx_ctrl_et = (tx_ctrl_value & TX_CTL_ET_MASK) >> TX_CTL_ET_SHIFT;
u32 tx_ctrl_nt = (tx_ctrl_value & TX_CTL_NT_MASK) >> TX_CTL_NT_SHIFT;
 
/* Check if we got TX */
-   if (!priv->tx_skb || tx_ctrl_ct)
+   if (!nps_enet_is_tx_pending(priv))
return;
 
/* Ack Tx ctrl register */
@@ -184,9 +191,6 @@ static int nps_enet_poll(struct napi_struct *napi, int 
budget)
work_done = nps_enet_rx_handler(ndev);
if (work_done < budget) {
u32 buf_int_enable_value = 0;
-   u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL);
-   u32 tx_ctrl_ct =
-   (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT;
 
napi_complete(napi);
 
@@ -205,8 +209,7 @@ static int nps_enet_poll(struct napi_struct *napi, int 
budget)
 * the two code lines below will solve this situation by
 * re-adding ourselves to the poll list.
 */
-
-   if (priv->tx_skb && !tx_ctrl_ct) {
+   if (nps_enet_is_tx_pending(priv)) {
nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0);
napi_reschedule(napi);
}
@@ -231,11 +234,9 @@ static irqreturn_t nps_enet_irq_handler(s32 irq, void 
*dev_instance)
struct net_device *ndev = dev_instance;
struct nps_enet_priv *priv = netdev_priv(ndev);
u32 rx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_RX_CTL);
-   u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL);
-   u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT;
u32 rx_ctrl_cr = (rx_ctrl_value & RX_CTL_CR_MASK) >> RX_CTL_CR_SHIFT;
 
-   if ((!tx_ctrl_ct && priv->tx_skb) || rx_ctrl_cr)
+   if (nps_enet_is_tx_pending(priv) || rx_ctrl_cr)
if (likely(napi_schedule_prep(&priv->napi))) {
nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0);
__napi_schedule(&priv->napi);
-- 
1.7.1



Re: [PATCH net-next 0/6] sctp: allow GSO frags to access the chunk too

2016-07-13 Thread David Miller
From: Marcelo Ricardo Leitner 
Date: Wed, 13 Jul 2016 15:08:54 -0300

> Patchset is named after the most important fix in it. First two patches
> are preparing the grounds for the 3rd patch.
> 
> After the 3rd, they are not strictly logically related to the patchset,
> but I kept them together as they depend on each other.
> 
> More details on patch changelogs.

Series applied, thanks.


Re: [PATCH -next] net: ethernet: bgmac: Remove redundant dev_err call in bgmac_probe()

2016-07-13 Thread David Miller
From: weiyj...@163.com
Date: Wed, 13 Jul 2016 12:46:57 +

> From: Wei Yongjun 
> 
> There is a error message within devm_ioremap_resource
> already, so remove the dev_err call to avoid redundant
> error message.
> 
> Signed-off-by: Wei Yongjun 

Applied.


Re: [PATCH -next] stmmac: dwmac-socfpga: remove redundant dev_err call in socfpga_dwmac_parse_data()

2016-07-13 Thread David Miller
From: weiyj...@163.com
Date: Wed, 13 Jul 2016 12:46:40 +

> From: Wei Yongjun 
> 
> There is a error message within devm_ioremap_resource
> already, so remove the dev_err call to avoid redundant
> error message.
> 
> Signed-off-by: Wei Yongjun 

Applied.


Re: [PATCH net-next] net: vrf: Address comments from last documentation update

2016-07-13 Thread David Miller
From: David Ahern 
Date: Wed, 13 Jul 2016 18:28:16 -0600

> Comments from Frank Kellerman on last doc update:
> - extra whitespace in front of a neigh show command
> - convert the brief link example to 'vrf red'
> 
> Signed-off-by: David Ahern 

Applied.


[PATCH net-next] net: vrf: Address comments from last documentation update

2016-07-13 Thread David Ahern
Comments from Frank Kellerman on last doc update:
- extra whitespace in front of a neigh show command
- convert the brief link example to 'vrf red'

Signed-off-by: David Ahern 
---
 Documentation/networking/vrf.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt
index 11a2b99bdbb9..755dab856392 100644
--- a/Documentation/networking/vrf.txt
+++ b/Documentation/networking/vrf.txt
@@ -189,7 +189,7 @@ older form without it.
 
 
Or using the brief output:
-   $ ip -br link show master red
+   $ ip -br link show vrf red
eth1 UP 02:00:00:00:02:02 

eth2 UP 02:00:00:00:02:03 

eth5 DOWN   02:00:00:00:02:06 
@@ -207,8 +207,8 @@ older form without it.
10.2.1.254 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE
10.2.2.254 dev eth2 lladdr 5e:54:01:6a:ee:80 REACHABLE
 
-$ ip -6 neigh show vrf red
-2002:1::64 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE
+   $ ip -6 neigh show vrf red
+   2002:1::64 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE
 
 
 6. Show Addresses for a VRF
-- 
2.7.4 (Apple Git-66)



[PATCH 1/2] net: ethernet: ll_temac: use phydev from struct net_device

2016-07-13 Thread Philippe Reynes
The private structure contain a pointer to phydev, but the structure
net_device already contain such pointer. So we can remove the pointer
phy in the private structure, and update the driver to use the
one contained in struct net_device.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/xilinx/ll_temac.h  |1 -
 drivers/net/ethernet/xilinx/ll_temac_main.c |   37 +++---
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac.h 
b/drivers/net/ethernet/xilinx/ll_temac.h
index 902457e..7d06e3e 100644
--- a/drivers/net/ethernet/xilinx/ll_temac.h
+++ b/drivers/net/ethernet/xilinx/ll_temac.h
@@ -332,7 +332,6 @@ struct temac_local {
struct device *dev;
 
/* Connection to PHY device */
-   struct phy_device *phy_dev; /* Pointer to PHY device */
struct device_node *phy_node;
 
/* MDIO bus data */
diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c 
b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 7397087..8d6a178 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -590,7 +590,7 @@ static void temac_device_reset(struct net_device *ndev)
 static void temac_adjust_link(struct net_device *ndev)
 {
struct temac_local *lp = netdev_priv(ndev);
-   struct phy_device *phy = lp->phy_dev;
+   struct phy_device *phy = ndev->phydev;
u32 mii_speed;
int link_state;
 
@@ -843,19 +843,20 @@ static irqreturn_t ll_temac_rx_irq(int irq, void *_ndev)
 static int temac_open(struct net_device *ndev)
 {
struct temac_local *lp = netdev_priv(ndev);
+   struct phy_device *phydev = NULL;
int rc;
 
dev_dbg(&ndev->dev, "temac_open()\n");
 
if (lp->phy_node) {
-   lp->phy_dev = of_phy_connect(lp->ndev, lp->phy_node,
-temac_adjust_link, 0, 0);
-   if (!lp->phy_dev) {
+   phydev = of_phy_connect(lp->ndev, lp->phy_node,
+   temac_adjust_link, 0, 0);
+   if (!phydev) {
dev_err(lp->dev, "of_phy_connect() failed\n");
return -ENODEV;
}
 
-   phy_start(lp->phy_dev);
+   phy_start(phydev);
}
 
temac_device_reset(ndev);
@@ -872,9 +873,8 @@ static int temac_open(struct net_device *ndev)
  err_rx_irq:
free_irq(lp->tx_irq, ndev);
  err_tx_irq:
-   if (lp->phy_dev)
-   phy_disconnect(lp->phy_dev);
-   lp->phy_dev = NULL;
+   if (phydev)
+   phy_disconnect(phydev);
dev_err(lp->dev, "request_irq() failed\n");
return rc;
 }
@@ -882,15 +882,15 @@ static int temac_open(struct net_device *ndev)
 static int temac_stop(struct net_device *ndev)
 {
struct temac_local *lp = netdev_priv(ndev);
+   struct phy_device *phydev = ndev->phydev;
 
dev_dbg(&ndev->dev, "temac_close()\n");
 
free_irq(lp->tx_irq, ndev);
free_irq(lp->rx_irq, ndev);
 
-   if (lp->phy_dev)
-   phy_disconnect(lp->phy_dev);
-   lp->phy_dev = NULL;
+   if (phydev)
+   phy_disconnect(phydev);
 
temac_dma_bd_release(ndev);
 
@@ -916,15 +916,13 @@ temac_poll_controller(struct net_device *ndev)
 
 static int temac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd)
 {
-   struct temac_local *lp = netdev_priv(ndev);
-
if (!netif_running(ndev))
return -EINVAL;
 
-   if (!lp->phy_dev)
+   if (!ndev->phydev)
return -EINVAL;
 
-   return phy_mii_ioctl(lp->phy_dev, rq, cmd);
+   return phy_mii_ioctl(ndev->phydev, rq, cmd);
 }
 
 static const struct net_device_ops temac_netdev_ops = {
@@ -971,20 +969,17 @@ static const struct attribute_group temac_attr_group = {
 /* ethtool support */
 static int temac_get_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
 {
-   struct temac_local *lp = netdev_priv(ndev);
-   return phy_ethtool_gset(lp->phy_dev, cmd);
+   return phy_ethtool_gset(ndev->phydev, cmd);
 }
 
 static int temac_set_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
 {
-   struct temac_local *lp = netdev_priv(ndev);
-   return phy_ethtool_sset(lp->phy_dev, cmd);
+   return phy_ethtool_sset(ndev->phydev, cmd);
 }
 
 static int temac_nway_reset(struct net_device *ndev)
 {
-   struct temac_local *lp = netdev_priv(ndev);
-   return phy_start_aneg(lp->phy_dev);
+   return phy_start_aneg(ndev->phydev);
 }
 
 static const struct ethtool_ops temac_ethtool_ops = {
-- 
1.7.4.4



[PATCH 2/2] net: ethernet: ll_temac: use phy_ethtool_{get|set}_link_ksettings

2016-07-13 Thread Philippe Reynes
There are two generics functions phy_ethtool_{get|set}_link_ksettings,
so we can use them instead of defining the same code in the driver.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/xilinx/ll_temac_main.c |   14 ++
 1 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c 
b/drivers/net/ethernet/xilinx/ll_temac_main.c
index 8d6a178..a9bd665 100644
--- a/drivers/net/ethernet/xilinx/ll_temac_main.c
+++ b/drivers/net/ethernet/xilinx/ll_temac_main.c
@@ -967,27 +967,17 @@ static const struct attribute_group temac_attr_group = {
 };
 
 /* ethtool support */
-static int temac_get_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
-{
-   return phy_ethtool_gset(ndev->phydev, cmd);
-}
-
-static int temac_set_settings(struct net_device *ndev, struct ethtool_cmd *cmd)
-{
-   return phy_ethtool_sset(ndev->phydev, cmd);
-}
-
 static int temac_nway_reset(struct net_device *ndev)
 {
return phy_start_aneg(ndev->phydev);
 }
 
 static const struct ethtool_ops temac_ethtool_ops = {
-   .get_settings = temac_get_settings,
-   .set_settings = temac_set_settings,
.nway_reset = temac_nway_reset,
.get_link = ethtool_op_get_link,
.get_ts_info = ethtool_op_get_ts_info,
+   .get_link_ksettings = phy_ethtool_get_link_ksettings,
+   .set_link_ksettings = phy_ethtool_set_link_ksettings,
 };
 
 static int temac_of_probe(struct platform_device *op)
-- 
1.7.4.4



Re: pull request: bluetooth-next 2016-07-13

2016-07-13 Thread David Miller
From: Johan Hedberg 
Date: Wed, 13 Jul 2016 11:25:40 +0300

> Here's our main bluetooth-next pull request for the 4.8 kernel:
> 
>  - Fixes and cleanups in 802.15.4 and 6LoWPAN code
>  - Fix out of bounds issue in btmrvl driver
>  - Fixes to Bluetooth socket recvmsg return values
>  - Use crypto_cipher_encrypt_one() instead of crypto_skcipher
>  - Cleanup of Bluetooth connection sysfs interface
>  - New Authentication failure reson code for Disconnected mgmt event
>  - New USB IDs for Atheros, Qualcomm and Intel Bluetooth controllers
> 
> Please let me know if there are any issues pulling. Thanks.

Pulled, thanks Johan.


Re: [net-next PATCH 2/3] pktgen: add sample script pktgen_sample05_flow_per_thread.sh

2016-07-13 Thread Alexei Starovoitov
On Wed, Jul 13, 2016 at 10:06:10PM +0200, Jesper Dangaard Brouer wrote:
> This pktgen sample script is useful for scalability testing a
> receiver.  The script will simply generate one flow per
> thread (option -t N) using the thread number as part of the
> source IP-address.
> 
> The single flow sample (pktgen_sample03_burst_single_flow.sh)
> have become quite popular, but it is important that developers
> also make sure to benchmark scalability of multiple receive
> queues.
> 
> Signed-off-by: Jesper Dangaard Brouer 
> ---
>  samples/pktgen/pktgen_sample05_flow_per_thread.sh |   81 
> +
>  1 file changed, 81 insertions(+)
...
> +# Setup source IP-addresses based on thread number
> +pg_set $dev "src_min 198.18.$((thread+1)).1"
> +pg_set $dev "src_max 198.18.$((thread+1)).1"

I have similar script that uses udp_src_min/max to change
port number, since port is easier to match on the target host
and we don't use ipv4 ;)
but this script is also good improvement. Thanks!
Acked-by: Alexei Starovoitov 



Re: [PATCH] rndis_host: Set random MAC for ZTE MF910

2016-07-13 Thread Bjørn Mork
Kristian Evensen  writes:

> From: Kristian Evensen 
>
> All ZTE MF910 mifis, at least on some revisions, export the same MAC
> address (36:4b:50:b7:ef:da). Check for this MAC address and set a random
> MAC if detected.
>
> Also, changed the memcpy() to ether_addr_copy(), as pointed out by
> checkpatch.
>
> Signed-off-by: Kristian Evensen 
> ---
>  drivers/net/usb/rndis_host.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c
> index 524a47a281..85bdbdf 100644
> --- a/drivers/net/usb/rndis_host.c
> +++ b/drivers/net/usb/rndis_host.c
> @@ -295,6 +295,9 @@ static const struct net_device_ops rndis_netdev_ops = {
>   .ndo_validate_addr  = eth_validate_addr,
>  };
>  
> +/* well-known buggy ZTE MF910 MAC address */
> +static const u8 buggy_zte_addr[ETH_ALEN] = {0x36, 0x4b, 0x50, 0xb7, 0xef, 
> 0xda};
> +
>  int
>  generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags)
>  {
> @@ -428,7 +431,11 @@ generic_rndis_bind(struct usbnet *dev, struct 
> usb_interface *intf, int flags)
>   dev_err(&intf->dev, "rndis get ethaddr, %d\n", retval);
>   goto halt_fail_and_release;
>   }
> - memcpy(net->dev_addr, bp, ETH_ALEN);
> +
> + if (ether_addr_equal(bp, buggy_zte_addr))
> + eth_hw_addr_random(net);
> + else
> + ether_addr_copy(net->dev_addr, bp);
>  
>   /* set a nonzero filter to enable data transfers */
>   memset(u.set, 0, sizeof *u.set);


Or how about the more generic?:

if (bp[0] & 0x02)
eth_hw_addr_random(net);
else
ether_addr_copy(net->dev_addr, bp);

That would catch similar screwups from other vendors too.


Bjørn


Re: [PATCH RFC 3/3] ARM64: dts: meson-gxbb: use the new meson8b DWMAC glue

2016-07-13 Thread Kevin Hilman
Michael Turquette  writes:

> Hi Martin,
>
> Quoting Martin Blumenstingl (2016-06-27 04:33:49)
>> On Mon, Jun 27, 2016 at 12:44 PM, Martin Blumenstingl
>>  wrote:
>> > On Mon, Jun 27, 2016 at 11:24 AM, Carlo Caione  wrote:
>> >> A syscon is a region containing a set of miscellaneous registers used
>> >> for several reasons by several devices [1]. It this case there is really
>> >> no need to define a new syscon node since those two registers are only
>> >> used by your driver.
>> > I can easily change it back if that's the way to go.
>> > Before I do that: could you please confirm that "mp2_clk_out" (which
>> > is controlled by PRG_ETH0/offset 0x0 bits 7-9) is not something which
>> > has to be available through the common clk framework?
>> there was just an IRC discussion with Carlo on this topic:
>> We tried to find whether PRG_ETH0 is used to actually configure
>> "mp2_clk_out". Carlo brought up that it could also be the case that
>> the ethernet block simply needs to be informed about the rate of the
>> mp2_clk_out (which is *probably* the "mpll2" clock).
>> 
>> I'm adding Michael Turquette to this mail, maybe you can comment on this 
>> topic.
>> 
>> If it turns out that the etthernet block just has to know about the
>> clock rate then we have two tasks:
>> 1. identify why the mpll2 rate returns 0 on my GXBB device
>
> This is in progress, but turns out it doesn't matter for Ethernet. Bit 4
> in PRG_ETHERNET_ADDR0 control a mux clock inside of the Ethernet
> controller.
>
> A value of 0x0 selects fclk_div2 and a value of 0x1 selects mp2_clk_out.
> The bootloader programs in sets the mux to zero, or fclk_div2 as the
> input clock (which runs at 1GHz).
>
>> 2. change my patch so the new DWMAC glue gets a reference to the mpll2
>> clock and then use "clk_get_rate(mpll2) / (250 * 100)" to
>> configure the PRG_ETH0_MP2_CLK bits.
>
> Hmm, I'm not sure about that part. Bits 7-9 is a divider that further
> divides the clock signal selected by bit 4. This is set to 0x4, which
> means we divide the 1GHz fclk_div2 down to 250MHz, which seems to be the
> expected value coming out of this divider.
>
> I haven't looked further to see if there is a further programmable
> divider to divide 250MHz down to 50MHz, or (more likely) there is simply
> a fixed-factor divide-by-5 that results in the 50MHz rate consumed by
> the PHY.
>
> Modeling this all in the mmc driver makes sense. So we would have:
>
> struct clk_mux clk_m250_sel ->
>   struct clk_divider clk_m250_div ->
>   struct clk_fixed_factor enet_phy_clk

There's also bit 10: "Generate 25MHz clock for PHY" (which is set to 1
by the bootloaders on P200 and odroidc2)

This suggests that it might not be a fixed-factor divide-by-5 but a
choice between a divide-by-5 and a divide-by-10 for the PHY clock.

Kevin


[PATCH net-next 1/2] devlink: fix build error for CONFIG_MODULES=n

2016-07-13 Thread Arnd Bergmann
A driver calling trace_devlink_hwmsg cannot be built when modules are disabled:

include/trace/events/devlink.h: In function 
'trace_event_get_offsets_devlink_hwmsg':
include/trace/events/devlink.h:25:51: error: dereferencing pointer to 
incomplete type 'struct module'
   __string(owner_name, devlink->dev->driver->owner->name)

This changes the code to only print the module name when modules are actually
enabled, otherwise we hardcode the string "built-in".

Signed-off-by: Arnd Bergmann 
Fixes: e5224f0fe2ac ("devlink: add hardware messages tracing facility")
---
 include/trace/events/devlink.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h
index 333c32ac9bfa..26f92d3c7e9c 100644
--- a/include/trace/events/devlink.h
+++ b/include/trace/events/devlink.h
@@ -22,7 +22,11 @@ TRACE_EVENT(devlink_hwmsg,
TP_STRUCT__entry(
__string(bus_name, devlink->dev->bus->name)
__string(dev_name, dev_name(devlink->dev))
+#ifdef CONFIG_MODULES
__string(owner_name, devlink->dev->driver->owner->name)
+#else
+   __string(owner_name, "built-in")
+#endif
__field(bool, incoming)
__field(unsigned long, type)
__dynamic_array(u8, buf, len)
@@ -32,7 +36,11 @@ TRACE_EVENT(devlink_hwmsg,
TP_fast_assign(
__assign_str(bus_name, devlink->dev->bus->name);
__assign_str(dev_name, dev_name(devlink->dev));
+#ifdef CONFIG_MODULES
__assign_str(owner_name, devlink->dev->driver->owner->name);
+#else
+   __assign_str(owner_name, "built-in");
+#endif
__entry->incoming = incoming;
__entry->type = type;
memcpy(__get_dynamic_array(buf), buf, len);
-- 
2.9.0



[PATCH net-next 2/2] devlink: fix trace format string

2016-07-13 Thread Arnd Bergmann
Including devlink.h on ARM and probably other 32-bit architectures results in
a harmless warning:

In file included from ../include/trace/define_trace.h:95:0,
 from ../include/trace/events/devlink.h:51,
 from ../net/core/devlink.c:30:
include/trace/events/devlink.h: In function 'trace_raw_output_devlink_hwmsg':
include/trace/events/devlink.h:42:12: error: format '%lu' expects argument of 
type 'long unsigned int', but argument 10 has type 'size_t {aka unsigned int}' 
[-Werror=format=]

The correct format string for 'size_t' is %zu, not %lu, this works on all
architectures.

Signed-off-by: Arnd Bergmann 
Fixes: e5224f0fe2ac ("devlink: add hardware messages tracing facility")
---
 include/trace/events/devlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h
index 26f92d3c7e9c..4b75a6f986fc 100644
--- a/include/trace/events/devlink.h
+++ b/include/trace/events/devlink.h
@@ -47,7 +47,7 @@ TRACE_EVENT(devlink_hwmsg,
__entry->len = len;
),
 
-   TP_printk("bus_name=%s dev_name=%s owner_name=%s incoming=%d type=%lu 
buf=0x[%*phD] len=%lu",
+   TP_printk("bus_name=%s dev_name=%s owner_name=%s incoming=%d type=%lu 
buf=0x[%*phD] len=%zu",
  __get_str(bus_name), __get_str(dev_name),
  __get_str(owner_name), __entry->incoming, __entry->type,
  (int) __entry->len, __get_dynamic_array(buf), __entry->len)
-- 
2.9.0



Re: [PATCH RFC 3/3] ARM64: dts: meson-gxbb: use the new meson8b DWMAC glue

2016-07-13 Thread Michael Turquette
Hi Martin,

Quoting Martin Blumenstingl (2016-06-27 04:33:49)
> On Mon, Jun 27, 2016 at 12:44 PM, Martin Blumenstingl
>  wrote:
> > On Mon, Jun 27, 2016 at 11:24 AM, Carlo Caione  wrote:
> >> A syscon is a region containing a set of miscellaneous registers used
> >> for several reasons by several devices [1]. In this case there is really
> >> no need to define a new syscon node since those two registers are only
> >> used by your driver.
> > I can easily change it back if that's the way to go.
> > Before I do that: could you please confirm that "mp2_clk_out" (which
> > is controlled by PRG_ETH0/offset 0x0 bits 7-9) is not something which
> > has to be available through the common clk framework?
> there was just an IRC discussion with Carlo on this topic:
> We tried to find whether PRG_ETH0 is used to actually configure
> "mp2_clk_out". Carlo brought up that it could also be the case that
> the ethernet block simply needs to be informed about the rate of the
> mp2_clk_out (which is *probably* the "mpll2" clock).
> 
> I'm adding Michael Turquette to this mail, maybe you can comment on this 
> topic.
> 
> If it turns out that the ethernet block just has to know about the
> clock rate then we have two tasks:
> 1. identify why the mpll2 rate returns 0 on my GXBB device

This is in progress, but turns out it doesn't matter for Ethernet. Bit 4
in PRG_ETHERNET_ADDR0 control a mux clock inside of the Ethernet
controller.

A value of 0x0 selects fclk_div2 and a value of 0x1 selects mp2_clk_out.
The bootloader programs in sets the mux to zero, or fclk_div2 as the
input clock (which runs at 1GHz).

> 2. change my patch so the new DWMAC glue gets a reference to the mpll2
> clock and then use "clk_get_rate(mpll2) / (250 * 100)" to
> configure the PRG_ETH0_MP2_CLK bits.

Hmm, I'm not sure about that part. Bits 7-9 is a divider that further
divides the clock signal selected by bit 4. This is set to 0x4, which
means we divide the 1GHz fclk_div2 down to 250MHz, which seems to be the
expected value coming out of this divider.

I haven't looked further to see if there is a further programmable
divider to divide 250MHz down to 50MHz, or (more likely) there is simply
a fixed-factor divide-by-5 that results in the 50MHz rate consumed by
the PHY.

Modeling this all in the mmc driver makes sense. So we would have:

struct clk_mux clk_m250_sel ->
struct clk_divider clk_m250_div ->
struct clk_fixed_factor enet_phy_clk

I don't know what the name should be for that last one, I just chose
enet_phy_clk since it illustrates the point. The updated docs suggest
that clk_m250_{sel,div} might be reasonable names for the mux and
divider.

Kevin and I just got this info from AmLogic earlier today. The next rev
of documentation should correct these register definitions.

Regards,
Mike


Re: 4.6.3 panic on nf_ct_delete (nf_conntrack)

2016-07-13 Thread nuclearcat

On 2016-07-13 23:21, Florian Westphal wrote:

nuclear...@nuclearcat.com  wrote:
Workload: pppoe server, 5k users on ppp interfaces. No actual 
SNAT/DNAT, but

using connmark and REDIRECT

[176412.990104] general protection fault:  [#1]
SMP


I assume that you did not see this before.

What was the last kernel version where you did not run into this?

Might help to narrow things down.
Difficult to say, because it was triggered also on 4.5.3 at 10 Jun, 
while I was running this kernel since May 10, and never had such issue 
before. Maybe some new traffic pattern caused this, or because 
interfaces saturated now, and might reach full bandwidth (800Mbps in 
bursts might reach 1G, and traffic will be dropped?).


Here is panic from 4.5.3:

[85867.255619] general protection fault:  [#1]
SMP

[85867.255939] Modules linked in:
cls_fw
act_police
cls_u32
sch_ingress
sch_sfq
sch_htb
netconsole
configfs
coretemp
nf_nat_pptp
nf_nat_proto_gre
nf_conntrack_pptp
nf_conntrack_proto_gre
pppoe
pppox
ppp_generic
slhc
tun
xt_REDIRECT
nf_nat_redirect
xt_TCPMSS
ipt_REJECT
nf_reject_ipv4
xt_set
ts_bm
xt_string
xt_connmark
xt_DSCP
xt_mark
xt_tcpudp
ip_set_hash_net
ip_set_hash_ip
ip_set
nfnetlink
iptable_mangle
iptable_filter
iptable_nat
nf_conntrack_ipv4
nf_defrag_ipv4
nf_nat_ipv4
nf_nat
nf_conntrack
ip_tables
x_tables
8021q
garp
mrp
stp
llc

[85867.263194] CPU: 7 PID: 0 Comm: swapper/7 Not tainted 
4.5.3-build-0100 #4
[85867.263397] Hardware name: HP ProLiant DL320e Gen8 v2, BIOS P80 
04/02/2015
[85867.263598] task: 880435584680 ti: 8804355a8000 task.ti: 
8804355a8000

[85867.263936] RIP: 0010:[]
[] nf_ct_delete+0x1a/0x1dc [nf_conntrack]
[85867.264343] RSP: 0018:8804474e3e80  EFLAGS: 00010282
[85867.264545] RAX: 8804021b3738 RBX: 8100 RCX: 
dead0200
[85867.264749] RDX:  RSI:  RDI: 
ffa00504021b36b0
[85867.264950] RBP: 8804474e3ec8 R08: 8804474e3f08 R09: 

[85867.265151] R10: 820090c0 R11: 0002 R12: 
ffa00504021b36b0
[85867.265351] R13:  R14:  R15: 
820090c8
[85867.265553] FS:  () GS:8804474e() 
knlGS:

[85867.265892] CS:  0010 DS:  ES:  CR0: 80050033
[85867.266092] CR2: 7fb170542dc8 CR3: 0200a000 CR4: 
001406e0

[85867.266295] Stack:
[85867.266490]  8804474e3ec0
810f996a
880435584680
8804474edc40

[85867.267057]  8100
a003d2b1
00fa
8804355ac000

[85867.267624]  820090c8
8804474e3ed8
a003d2be
8804474e3ef8

[85867.268192] Call Trace:
[85867.268392]  

[85867.268456]  [] ? hrtimer_forward+0xd5/0xeb
[85867.268857]  [] ? nf_ct_delete+0x1dc/0x1dc 
[nf_conntrack]
[85867.269062]  [] death_by_timeout+0xd/0xf 
[nf_conntrack]

[85867.269265]  [] call_timer_fn.isra.26+0x17/0x6d
[85867.269468]  [] run_timer_softirq+0x176/0x197
[85867.269672]  [] __do_softirq+0xb9/0x1a9
[85867.269873]  [] irq_exit+0x37/0x7c
[85867.270077]  [] smp_apic_timer_interrupt+0x3d/0x48
[85867.270282]  [] apic_timer_interrupt+0x7c/0x90
[85867.270484]  

[85867.270546]  [] ? mwait_idle+0x64/0x7a
[85867.270943]  [] arch_cpu_idle+0xa/0xc
[85867.271144]  [] default_idle_call+0x27/0x29
[85867.271345]  [] cpu_startup_entry+0x11f/0x1c9
[85867.271548]  [] start_secondary+0xf1/0xf4
[85867.271750] Code:
e8
35
60
08
e1
58
5b
41
5c
41
5d
41
5e
41
5f
5d
c3
55
48
89
e5
41
57
41
56
41
55
41
54
41
89
f5
53
49
89
fc
41
89
d6
48
83
ec
20

8b
9f
c8
00
00
00
48
85
db
74
20
0f
b7
43
1c
66
85
c0
74
17

[85867.275937] RIP
[] nf_ct_delete+0x1a/0x1dc [nf_conntrack]
[85867.276200]  RSP 
[85867.276423] ---[ end trace 7be551057bff38cd ]---
[85867.285767] Kernel panic - not syncing: Fatal exception in interrupt
[85867.285973] Kernel Offset: disabled
[85867.319076] Rebooting in 5 seconds..


Re: [PATCH 1/1] tracing, bpf: Implement function bpf_probe_write

2016-07-13 Thread Sargun Dhillon


On Wed, 13 Jul 2016, Alexei Starovoitov wrote:

> On Wed, Jul 13, 2016 at 03:36:11AM -0700, Sargun Dhillon wrote:
>> Provides BPF programs, attached to kprobes a safe way to write to
>> memory referenced by probes. This is done by making probe_kernel_write
>> accessible to bpf functions via the bpf_probe_write helper.
>
> not quite :)
>
>> Signed-off-by: Sargun Dhillon 
>> ---
>>  include/uapi/linux/bpf.h  |  3 +++
>>  kernel/trace/bpf_trace.c  | 20 
>>  samples/bpf/bpf_helpers.h |  2 ++
>>  3 files changed, 25 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 406459b..355b565 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -313,6 +313,9 @@ enum bpf_func_id {
>>   */
>>   BPF_FUNC_skb_get_tunnel_opt,
>>   BPF_FUNC_skb_set_tunnel_opt,
>> +
>> + BPF_FUNC_probe_write, /* int bpf_probe_write(void *dst, void *src,
>> int size) */
>> +
>
> the patch is against some old kernel.
> Please always make the patch against net-next tree and cc netdev list.
>
Sorry, I did this against Linus's tree, not net-next. Will fix.

>> +static u64 bpf_probe_write(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
>> +{
>> + void *dst = (void *) (long) r1;
>> + void *unsafe_ptr = (void *) (long) r2;
>> + int  size = (int) r3;
>> +
>> + return probe_kernel_write(dst, unsafe_ptr, size);
>> +}
>
> the patch is whitepsace mangled. Please see 
> Documentation/networking/netdev-FAQ.txt
Also will fix.

>
> the main issue though that we cannot simply allow bpf to do probe_write,
> since it may crash the kernel.
> What might be ok is to allow writing into memory of current
> user space process only. This way bpf prog will keep kernel safety guarantees,
> yet it will be able to modify user process memory when necessary.
> Since bpf+tracing is root only, it doesn't pose security risk.
>
>

Doesn't probe_write prevent you from writing to protected memory and 
generate an EFAULT? Or are you worried about the situation where a bpf 
program writes to some other chunk of kernel memory, or writes bad data 
to said kernel memory?

I guess when I meant "safe" -- it's safer than allowing arbitrary memcpy. 
I don't see a good way to ensure safety otherwise as we don't know 
which registers point to memory that it's reasonable for probes to 
manipulate. It's not like skb_store_bytes where we can check the pointer 
going in is the same pointer that's referenced, and with a super 
restricted datatype.

Perhaps, it would be a good idea to describe an example where I used this:
#include 
#include 
#include 


int trace_inet_stream_connect(struct pt_regs *ctx)
{
if (!PT_REGS_PARM2(ctx)) {
return 0;
}
struct sockaddr uaddr = {};
struct sockaddr_in *addr_in;
bpf_probe_read(&uaddr, sizeof(struct sockaddr), (void 
*)PT_REGS_PARM2(ctx));
if (uaddr.sa_family == AF_INET) {
// Simple cast causes LLVM weirdness
addr_in = &uaddr;
char fmt[] = "Connecting on port: %d\n";
bpf_trace_printk(fmt, sizeof(fmt), ntohs(addr_in->sin_port));
if (ntohs(addr_in->sin_port) == 80) {
addr_in->sin_port = htons(443);
bpf_probe_write((void *)PT_REGS_PARM2(ctx), &uaddr, 
sizeof(uaddr));
}
}
return 0;
};

There are two reasons I want to do this:
1) Debugging - sometimes, it makes sense to divert a program's syscalls in 
order to allow for better debugging
2) Network Functions - I wrote a load balancer which intercepts 
inet_stream_connect & tcp_set_state. We can manipulate the destination 
address as neccessary at connect time. This also has the nice side effect 
that getpeername() returns the real IP that a server is connected to, and 
the performance is far better than doing "network load balancing"

(I realize this is a total hack, better approaches would be appreciated)

If we allowed manipulation of the current task's user memory by exposing 
copy_to_user, that could also work if I attach the probe to sys_connect, 
I could overwrite the address there before it gets copied into 
kernel space, but that could lead to its own weirdness.

Any ideas?


Re: 4.6.3 panic on nf_ct_delete (nf_conntrack)

2016-07-13 Thread Florian Westphal
nuclear...@nuclearcat.com  wrote:
> Workload: pppoe server, 5k users on ppp interfaces. No actual SNAT/DNAT, but
> using connmark and REDIRECT
> 
> [176412.990104] general protection fault:  [#1]
> SMP

I assume that you did not see this before.

What was the last kernel version where you did not run into this?

Might help to narrow things down.


4.6.3 panic on nf_ct_delete (nf_conntrack)

2016-07-13 Thread nuclearcat
Workload: pppoe server, 5k users on ppp interfaces. No actual SNAT/DNAT, 
but using connmark and REDIRECT


[176412.990104] general protection fault:  [#1]
SMP

[176412.990424] Modules linked in:
sch_pie
cls_fw
act_police
cls_u32
sch_ingress
sch_sfq
sch_htb
netconsole

[176412.991427]  configfs
coretemp
nf_nat_pptp
nf_nat_proto_gre
nf_conntrack_pptp
nf_conntrack_proto_gre
pppoe
pppox

[176412.992571]  ppp_generic
slhc

[176412.993218]  tun
xt_REDIRECT
nf_nat_redirect
xt_TCPMSS
ipt_REJECT
nf_reject_ipv4
xt_set
ts_bm
xt_string
xt_connmark
xt_DSCP
xt_mark
xt_tcpudp
ip_set_hash_net
ip_set_hash_ip
ip_set
nfnetlink
iptable_mangle
iptable_filter
iptable_nat
nf_conntrack_ipv4
nf_defrag_ipv4
nf_nat_ipv4
nf_nat
nf_conntrack
ip_tables
x_tables
8021q

[176412.996208]  garp
mrp
stp
llc

[176412.996834] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 
4.6.3-build-0105 #4
[176412.997037] Hardware name: HP ProLiant DL320e Gen8 v2, BIOS P80 
04/02/2015
[176412.997241] task: 88043558af00 ti: 8804355a task.ti: 
8804355a

[176412.997580] RIP: 0010:[]
[] nf_ct_delete+0x26/0x1dc [nf_conntrack]
[176412.997985] RSP: 0018:8804474a3e80  EFLAGS: 00010282
[176412.998187] RAX: 880428bc0c90 RBX: ffac050402505080 RCX: 
dead0200
[176412.998524] RDX:  RSI:  RDI: 
880428bc0c08
[176412.998865] RBP: 8804474a3ec8 R08: 8804474a3f08 R09: 

[176412.999204] R10: 820050c0 R11: 049a R12: 
880428bc0c08
[176412.999545] R13:  R14:  R15: 
820050c8
[176412.999885] FS:  () GS:8804474a() 
knlGS:

[176413.000226] CS:  0010 DS:  ES:  CR0: 80050033
[176413.000427] CR2: 7f1dc4960100 CR3: 02006000 CR4: 
001406e0

[176413.000767] Stack:
[176413.000963]  8804474a3ec0
810fb036
88043558af07
8804474adcc0

[176413.001534]  8100
a003d2ad
00a1
8804355a4000

[176413.002097]  820050c8
8804474a3ed8
a003d2ba
8804474a3ef8

[176413.002666] Call Trace:
[176413.002862]  

[176413.002926]  [] ? hrtimer_forward+0xd5/0xeb
[176413.003322]  [] ? nf_ct_delete+0x1dc/0x1dc 
[nf_conntrack]
[176413.003525]  [] death_by_timeout+0xd/0xf 
[nf_conntrack]

[176413.003727]  [] call_timer_fn.isra.26+0x17/0x6d
[176413.003931]  [] run_timer_softirq+0x176/0x197
[176413.004134]  [] __do_softirq+0xb9/0x1a9
[176413.004333]  [] irq_exit+0x37/0x7c
[176413.004533]  [] smp_apic_timer_interrupt+0x3d/0x48
[176413.004734]  [] apic_timer_interrupt+0x7c/0x90
[176413.004935]  

[176413.004997]  [] ? mwait_idle+0x68/0x7e
[176413.005391]  [] arch_cpu_idle+0xa/0xc
[176413.005592]  [] default_idle_call+0x27/0x29
[176413.005791]  [] cpu_startup_entry+0x115/0x1bf
[176413.005993]  [] start_secondary+0xf1/0xf4
[176413.006193] Code:
5e
41
5f
5d
c3
55
48
89
e5
41
57
41
56
41
55
41
54
41
89
f5
53
49
89
fc
41
89
d6
48
83
ec
20
48
8b
9f
c8
00
00
00
48
85
db
74
20
b7
43
1c
66
85
c0
74
17
48
01
c3
74
12
48
83
7b
08
00
75
0b

[176413.010382] RIP
[] nf_ct_delete+0x26/0x1dc [nf_conntrack]
[176413.010643]  RSP 
[176413.010855] ---[ end trace cf1060fc5087293e ]---
[176413.018573] Kernel panic - not syncing: Fatal exception in interrupt
[176413.018781] Kernel Offset: disabled
[176413.046284] ERST: [Firmware Warn]: Firmware does not respond in 
time.

[176413.050041] Rebooting in 5 seconds..


[net-next PATCH 0/3] pktgen samples: new scripts and removing older samples

2016-07-13 Thread Jesper Dangaard Brouer
This patchset is adding some pktgen sample scripts that I've been
using for a while[1], and they seem to be relevant to more people.

Patchset also removes some of the older style pktgen samples.

[1] https://github.com/netoptimizer/network-testing/tree/master/pktgen

---

Jesper Dangaard Brouer (3):
  pktgen: add sample script pktgen_sample04_many_flows.sh
  pktgen: add sample script pktgen_sample05_flow_per_thread.sh
  pktgen: remove sample script pktgen.conf-1-1-rdos


 samples/pktgen/pktgen.conf-1-1-flows  |   67 ---
 samples/pktgen/pktgen.conf-1-1-rdos   |   64 --
 samples/pktgen/pktgen_sample04_many_flows.sh  |   93 +
 samples/pktgen/pktgen_sample05_flow_per_thread.sh |   81 ++
 4 files changed, 174 insertions(+), 131 deletions(-)
 delete mode 100755 samples/pktgen/pktgen.conf-1-1-flows
 delete mode 100755 samples/pktgen/pktgen.conf-1-1-rdos
 create mode 100755 samples/pktgen/pktgen_sample04_many_flows.sh
 create mode 100755 samples/pktgen/pktgen_sample05_flow_per_thread.sh

--


[net-next PATCH 2/3] pktgen: add sample script pktgen_sample05_flow_per_thread.sh

2016-07-13 Thread Jesper Dangaard Brouer
This pktgen sample script is useful for scalability testing a
receiver.  The script will simply generate one flow per
thread (option -t N) using the thread number as part of the
source IP-address.

The single flow sample (pktgen_sample03_burst_single_flow.sh)
has become quite popular, but it is important that developers
also make sure to benchmark scalability of multiple receive
queues.

Signed-off-by: Jesper Dangaard Brouer 
---
 samples/pktgen/pktgen_sample05_flow_per_thread.sh |   81 +
 1 file changed, 81 insertions(+)
 create mode 100755 samples/pktgen/pktgen_sample05_flow_per_thread.sh

diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh 
b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
new file mode 100755
index ..32ad818e2829
--- /dev/null
+++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# Script will generate one flow per thread (-t N)
+#  - Same destination IP
+#  - Fake source IPs for each flow (fixed based on thread number)
+#
+# Useful for scale testing on receiver, to see whether silo'ing flows
+# works and scales.  For optimal scalability (on receiver) each
+# separate-flow should not access shared variables/data. This script
+# helps magnify any of these scaling issues by overloading the receiver.
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+# Set some default params, if they didn't get set
+[ -z "$DEST_IP" ]   && DEST_IP="198.18.0.42"
+[ -z "$DST_MAC" ]   && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+[ -z "$BURST" ] && BURST=32
+
+
+# Base Config
+DELAY="0"  # Zero means max speed
+COUNT="0"  # Zero means indefinitely
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = 0; thread < $THREADS; thread++)); do
+dev=${DEV}@${thread}
+
+# Add remove all other devices and add_device $dev to thread
+pg_thread $thread "rem_device_all"
+pg_thread $thread "add_device" $dev
+
+# Base config
+pg_set $dev "flag QUEUE_MAP_CPU"
+pg_set $dev "count $COUNT"
+pg_set $dev "clone_skb $CLONE_SKB"
+pg_set $dev "pkt_size $PKT_SIZE"
+pg_set $dev "delay $DELAY"
+pg_set $dev "flag NO_TIMESTAMP"
+
+# Single destination
+pg_set $dev "dst_mac $DST_MAC"
+pg_set $dev "dst $DEST_IP"
+
+# Setup source IP-addresses based on thread number
+pg_set $dev "src_min 198.18.$((thread+1)).1"
+pg_set $dev "src_max 198.18.$((thread+1)).1"
+
+# Setup burst, for easy testing -b 0 disable bursting
+# (internally in pktgen default and minimum burst=1)
+if [[ ${BURST} -ne 0 ]]; then
+   pg_set $dev "burst $BURST"
+else
+   info "$dev: Not using burst"
+fi
+
+done
+
+# Run if user hits control-c
+function print_result() {
+# Print results
+for ((thread = 0; thread < $THREADS; thread++)); do
+   dev=${DEV}@${thread}
+   echo "Device: $dev"
+   cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
+}
+# trap keyboard interrupt (Ctrl-C)
+trap true SIGINT
+
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+
+print_result



[PATCH v3 3/3] mac80211: mesh: fixed HT ies in beacon template

2016-07-13 Thread Yaniv Machani
The HT capab info field inside the HT capab IE of the mesh beacon
is incorrect (in the case of 20MHz channel width).
To fix this driver will check configuration from cfg and
will build it accordingly.

Signed-off-by: Meirav Kama 
Signed-off-by: Yaniv Machani 
---
V3 - Fixes redundant spaces, empty lines and adds a FALLTHROUGH note.

 net/mac80211/mesh.c | 33 -
 net/mac80211/util.c |  3 ---
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 9214bc1..6a67049 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -422,6 +422,7 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata,
enum nl80211_band band = ieee80211_get_sdata_band(sdata);
struct ieee80211_supported_band *sband;
u8 *pos;
+   u16 cap;
 
sband = local->hw.wiphy->bands[band];
if (!sband->ht_cap.ht_supported ||
@@ -430,11 +431,41 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data 
*sdata,
sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10)
return 0;
 
+   /* determine capability flags */
+   cap = sband->ht_cap.cap;
+
+   /* if channel width is 20MHz - configure HT capab accordingly*/
+   if (sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20) {
+   cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40;
+   cap &= ~IEEE80211_HT_CAP_DSSSCCK40;
+   }
+
+   /* set SM PS mode properly */
+   cap &= ~IEEE80211_HT_CAP_SM_PS;
+   switch (sdata->smps_mode) {
+   case IEEE80211_SMPS_AUTOMATIC:
+   case IEEE80211_SMPS_NUM_MODES:
+   WARN_ON(1);
+   /* FALLTHROUGH */
+   case IEEE80211_SMPS_OFF:
+   cap |= WLAN_HT_CAP_SM_PS_DISABLED <<
+   IEEE80211_HT_CAP_SM_PS_SHIFT;
+   break;
+   case IEEE80211_SMPS_STATIC:
+   cap |= WLAN_HT_CAP_SM_PS_STATIC <<
+   IEEE80211_HT_CAP_SM_PS_SHIFT;
+   break;
+   case IEEE80211_SMPS_DYNAMIC:
+   cap |= WLAN_HT_CAP_SM_PS_DYNAMIC <<
+   IEEE80211_HT_CAP_SM_PS_SHIFT;
+   break;
+   }
+
if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap))
return -ENOMEM;
 
pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap));
-   ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, sband->ht_cap.cap);
+   ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, cap);
 
return 0;
 }
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 42bf0b6..5375a82 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2349,10 +2349,7 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct 
ieee80211_sta_ht_cap *ht_cap,
ht_oper->operation_mode = cpu_to_le16(prot_mode);
ht_oper->stbc_param = 0x;
 
-   /* It seems that Basic MCS set and Supported MCS set
-  are identical for the first 10 bytes */
memset(&ht_oper->basic_set, 0, 16);
-   memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10);
 
return pos + sizeof(struct ieee80211_ht_operation);
 }
-- 
2.9.0



[net-next PATCH 1/3] pktgen: add sample script pktgen_sample04_many_flows.sh

2016-07-13 Thread Jesper Dangaard Brouer
Adding a pktgen sample script that demonstrates how to use pktgen
for simulating flows.  Script will generate a certain number of
concurrent flows ($FLOWS) and each flow will contain $FLOWLEN
packets, which will be send back-to-back, before switching to a
new flow, due to flag FLOW_SEQ.

This script obsoletes the old sample script 'pktgen.conf-1-1-flows',
which is removed.

Signed-off-by: Jesper Dangaard Brouer 
---
 samples/pktgen/pktgen.conf-1-1-flows |   67 ---
 samples/pktgen/pktgen_sample04_many_flows.sh |   93 ++
 2 files changed, 93 insertions(+), 67 deletions(-)
 delete mode 100755 samples/pktgen/pktgen.conf-1-1-flows
 create mode 100755 samples/pktgen/pktgen_sample04_many_flows.sh

diff --git a/samples/pktgen/pktgen.conf-1-1-flows 
b/samples/pktgen/pktgen.conf-1-1-flows
deleted file mode 100755
index 081749c9707d..
--- a/samples/pktgen/pktgen.conf-1-1-flows
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/bin/bash
-
-#modprobe pktgen
-
-
-function pgset() {
-local result
-
-echo $1 > $PGDEV
-
-result=`cat $PGDEV | fgrep "Result: OK:"`
-if [ "$result" = "" ]; then
- cat $PGDEV | fgrep Result:
-fi
-}
-
-# Config Start Here ---
-
-
-# thread config
-# Each CPU has its own thread. One CPU example. We add eth1.
-
-PGDEV=/proc/net/pktgen/kpktgend_0
-  echo "Removing all devices"
- pgset "rem_device_all"
-  echo "Adding eth1"
- pgset "add_device eth1"
-
-
-# device config
-# delay 0
-# We need to do alloc for every skb since we cannot clone here.
-
-CLONE_SKB="clone_skb 0"
-# NIC adds 4 bytes CRC
-PKT_SIZE="pkt_size 60"
-
-# COUNT 0 means forever
-#COUNT="count 0"
-COUNT="count 1000"
-DELAY="delay 0"
-
-PGDEV=/proc/net/pktgen/eth1
-  echo "Configuring $PGDEV"
- pgset "$COUNT"
- pgset "$CLONE_SKB"
- pgset "$PKT_SIZE"
- pgset "$DELAY"
- # Random address with in the min-max range
- pgset "flag IPDST_RND"
- pgset "dst_min 10.0.0.0"
- pgset "dst_max 10.255.255.255"
-
- # 8k Concurrent flows at 4 pkts
- pgset "flows 8192"
- pgset "flowlen 4"
-
- pgset "dst_mac  00:04:23:08:91:dc"
-
-# Time to run
-PGDEV=/proc/net/pktgen/pgctrl
-
- echo "Running... ctrl^C to stop"
- trap true INT
- pgset "start"
- echo "Done"
- cat /proc/net/pktgen/eth1
diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh 
b/samples/pktgen/pktgen_sample04_many_flows.sh
new file mode 100755
index ..f60412e445bb
--- /dev/null
+++ b/samples/pktgen/pktgen_sample04_many_flows.sh
@@ -0,0 +1,93 @@
+#!/bin/bash
+#
+# Script example for many flows testing
+#
+# Number of simultaneous flows limited by variable $FLOWS
+# and number of packets per flow controlled by variable $FLOWLEN
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+
+# Parameter parsing via include
+source ${basedir}/parameters.sh
+# Set some default params, if they didn't get set
+[ -z "$DEST_IP" ]   && DEST_IP="198.18.0.42"
+[ -z "$DST_MAC" ]   && DST_MAC="90:e2:ba:ff:ff:ff"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# NOTICE:  Script specific settings
+# ===
+# Limiting the number of concurrent flows ($FLOWS)
+# and also set how many packets each flow contains ($FLOWLEN)
+#
+[ -z "$FLOWS" ] && FLOWS="8000"
+[ -z "$FLOWLEN" ]   && FLOWLEN="10"
+
+# Base Config
+DELAY="0"  # Zero means max speed
+COUNT="0"  # Zero means indefinitely
+
+if [[ -n "$BURST" ]]; then
+err 1 "Bursting not supported for this mode"
+fi
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((thread = 0; thread < $THREADS; thread++)); do
+dev=${DEV}@${thread}
+
+# Add remove all other devices and add_device $dev to thread
+pg_thread $thread "rem_device_all"
+pg_thread $thread "add_device" $dev
+
+# Base config
+pg_set $dev "flag QUEUE_MAP_CPU"
+pg_set $dev "count $COUNT"
+pg_set $dev "clone_skb $CLONE_SKB"
+pg_set $dev "pkt_size $PKT_SIZE"
+pg_set $dev "delay $DELAY"
+pg_set $dev "flag NO_TIMESTAMP"
+
+# Single destination
+pg_set $dev "dst_mac $DST_MAC"
+pg_set $dev "dst $DEST_IP"
+
+# Randomize source IP-addresses
+pg_set $dev "flag IPSRC_RND"
+pg_set $dev "src_min 198.18.0.0"
+pg_set $dev "src_max 198.19.255.255"
+
+# Limit number of flows (max 65535)
+pg_set $dev "flows $FLOWS"
+#
+# How many packets a flow will send, before flow "entry" is
+# re-generated/setup.
+pg_set $dev "flowlen $FLOWLEN"
+#
+# Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow
+# being send back-to-back, before next flow is selected
+# incrementally.  This helps lookup caches, and is more realistic.
+#
+pg_set $dev "flag FLOW_SEQ"
+
+done
+
+# Run if user hits control-c
+function print_result() {
+# Print results
+for ((thread = 0; thread < $THREADS; thread++)); do
+   dev=${DEV}@${thread}
+   echo "Device: $dev"
+  

[net-next PATCH 3/3] pktgen: remove sample script pktgen.conf-1-1-rdos

2016-07-13 Thread Jesper Dangaard Brouer
Removing the pktgen sample script pktgen.conf-1-1-rdos, because
it does not contain anything that is not covered by the other and
newer style sample scripts.

Signed-off-by: Jesper Dangaard Brouer 
---
 samples/pktgen/pktgen.conf-1-1-rdos |   64 ---
 1 file changed, 64 deletions(-)
 delete mode 100755 samples/pktgen/pktgen.conf-1-1-rdos

diff --git a/samples/pktgen/pktgen.conf-1-1-rdos 
b/samples/pktgen/pktgen.conf-1-1-rdos
deleted file mode 100755
index c7553be49b80..
--- a/samples/pktgen/pktgen.conf-1-1-rdos
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-#modprobe pktgen
-
-
-function pgset() {
-local result
-
-echo $1 > $PGDEV
-
-result=`cat $PGDEV | fgrep "Result: OK:"`
-if [ "$result" = "" ]; then
- cat $PGDEV | fgrep Result:
-fi
-}
-
-# Config Start Here ---
-
-
-# thread config
-# Each CPU has its own thread. One CPU example. We add eth1.
-
-PGDEV=/proc/net/pktgen/kpktgend_0
-  echo "Removing all devices"
- pgset "rem_device_all"
-  echo "Adding eth1"
- pgset "add_device eth1"
-
-
-# device config
-# delay 0
-
-# We need to do alloc for every skb since we cannot clone here.
-
-CLONE_SKB="clone_skb 0"
-# NIC adds 4 bytes CRC
-PKT_SIZE="pkt_size 60"
-
-# COUNT 0 means forever
-#COUNT="count 0"
-COUNT="count 1000"
-DELAY="delay 0"
-
-PGDEV=/proc/net/pktgen/eth1
-  echo "Configuring $PGDEV"
- pgset "$COUNT"
- pgset "$CLONE_SKB"
- pgset "$PKT_SIZE"
- pgset "$DELAY"
- # Random address with in the min-max range
- pgset "flag IPDST_RND"
- pgset "dst_min 10.0.0.0"
- pgset "dst_max 10.255.255.255"
-
- pgset "dst_mac  00:04:23:08:91:dc"
-
-# Time to run
-PGDEV=/proc/net/pktgen/pgctrl
-
- echo "Running... ctrl^C to stop"
- trap true INT
- pgset "start"
- echo "Done"
- cat /proc/net/pktgen/eth1



RE: [PATCH 1/4] mac80211: mesh: flush stations before beacons are stopped

2016-07-13 Thread Machani, Yaniv
On Wed, Jul 13, 2016 at 16:33:38, Bob Copeland wrote:
> linux- wirel...@vger.kernel.org; netdev@vger.kernel.org; Hahn, Maital
> Subject: Re: [PATCH 1/4] mac80211: mesh: flush stations before beacons 
> are stopped
> 
> On Wed, Jul 13, 2016 at 10:11:25AM +, Machani, Yaniv wrote:
> > > > Some drivers (e.g. wl18xx) expect that the last stage in the 
> > > > de-initialization process will be stopping the beacons, similar to ap.
> > > > Update ieee80211_stop_mesh() flow accordingly.
> > > >
> > > How well have you tested that with other drivers?
> > >
> >
> > Sorry for the delayed response (I've been out) and thanks for your 
> > comments, I have tested it with RT3572 as well, and didn't see any issue.
> > I'll update the comment to reflect that.
> 
> I'll give this a test on ath10k and wcn36xx as they are the ones most 
> likely to care about ordering.
> 

Thank you,
Yaniv
> --
> Bob Copeland %% http://bobcopeland.com/




Re: linux-next: Tree for Jul 13 (net/core/devlink with Tracing)

2016-07-13 Thread Steven Rostedt
On Wed, 13 Jul 2016 08:12:16 -0700
Randy Dunlap  wrote:

> On 07/12/16 23:47, Stephen Rothwell wrote:
> > Hi all,
> > 
> > Changes since 20160712:
> >   
> 
> on x86_64:
> (full randconfig file is attached)
> 
> 
>   CC  net/core/devlink.o
> In file included from ../include/trace/define_trace.h:95:0,
>  from ../include/trace/events/devlink.h:51,
>  from ../net/core/devlink.c:30:
> ../include/trace/events/devlink.h: In function 
> 'trace_event_get_offsets_devlink_hwmsg':
> ../include/trace/events/devlink.h:25:51: error: dereferencing pointer to 
> incomplete type
>__string(owner_name, devlink->dev->driver->owner->name)

[snip the rest]

When I remove all references to the owner_name (and that crazy
dereferencing above), it compiles fine. There must be something funky
with that devlink->dev->driver->owner->name part.

-- Steve




Re: [PATCH v7 09/11] net/mlx4_en: add xdp forwarding and data write support

2016-07-13 Thread Saeed Mahameed
On Wed, Jul 13, 2016 at 8:30 PM, Brenden Blanco  wrote:
> On Wed, Jul 13, 2016 at 06:25:28PM +0300, Saeed Mahameed wrote:
>> On Tue, Jul 12, 2016 at 12:29 AM, Brenden Blanco  
>> wrote:
[...]
>>
>> MAX_TX_RING is a software limitation made to limit netdev real_num_tx
>> queues for CX3 internal cache utilization,
>> in your case the netdev doesn't care about xdp_tx rings, the
>> accounting you added in this patch adds a  lot of
>> complications and it would be better to have clear separation between
>> the two types of tx_rings, in terms of the holding/managing data
>> structure.
>>
>> I suggest here to leave priv->tx_ring untouched. i.e, don't store the
>> xdp_tx rings at the end of it, i.e  priv->tx_ring should only reflect
>> the
>> netdev real tx queues.
>>
>> In case of priv->porg is active, we can allocate and store xdp tx ring
>> per rx ring, this tx ring will be allocated and activated
>> once the rx ring is created and activated, and store this dedicated tx
>> ring  in the rx_ring it self.
>>
>> i.e :
>> struct mlx4_en_rx_ring {
>> [...]
>> struct mlx4_en_tx_ring *xdp_tx;
>> struct mlx4_en_cq *xdp_tx_cq;
>> [...]
>> }
>>
>> for this the following changes are required.
>>
>> @ mlx4_en_create_rx_ring
>> [...] // Create the RX ring
>>
>> /* create CQ for xdp tx ring */
>> node = cpu_to_node(i % num_online_cpus());
>>
>> mlx4_en_create_cq(priv, &rx_ring->xdp_tx_cq, prof->tx_ring_size, i, TX, node)
>>
>> /* create xdp tx ring */
>> mlx4_en_create_tx_ring(priv, &rx_ring->xdp_tx, prof->tx_ring_size,
>> TXBB_SIZE, node, -1)
>>
>> @mlx4_en_start/stop_port
>> /* Configure tx cq's and rings */
>> // You will need to configure xdp tx rings same as priv->rx_ring_num rings
>>
>> @mlx4_en_poll_tx_cq
>> This Also will require a new NAPI handler for xdp rings to replace the
>> following line @mlx4_en_poll_tx_cq
>> - struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
>> with
>> + struct mlx4_en_tx_ring *ring = priv->rx_ring[cq->ring].xdp_tx;
>>
>> Or just change cq->ring from ring index to the actual ring pointer.
>>
>> Bottom line, my suggestion also started to look complicated :).. but
>> still it would look cleaner to separate between netdev rings and xdp
>> rings.
>>
> I considered this at first too, but it seemed the worse option to me at
> the time. There would be a lot of copy/paste as well as new code to
> review.

We can start from a small refactoring patch that moves code around and
extracts the needed helper functions. But it is up to you and Tariq.

it is really non-trivial to follow the logic of the rsv_tx_rings and
tx_ring_num accounting.

>>
>> If in this napi cycle we had at least one packet that went through
>> XDP_PASS (mlx4_en_xmit_frame) you must hit doorbell here,
> You mean XDP_TX?

yes

>> otherwise if no packet will be forwarded later, this packet will never
>> be really sent and it will wait in HW forever.
>>
>> The idea is correct to hit the doorbell only at the end of
>> mlx4_en_process_rx_cq cycle to batch tx descriptors and send them in
>> one batch,
> Up to a budget of 8
>> but you must hit doorbell at the end of the cycle. you can't just
>> assume more RX packets will come in the future to hit the doorbell for
>> you.
> I don't assume that. If you look at the code, either:
> xmit_frame rings the doorbell, in which case doorbell_pending <- 0
> or
> xmit_frame doesn't ring the doorbell, in which case doorbell_pending++
> So how is a packet left in the ring unannounced?

Ooh, now i see, yeap the logic is good.

>>
>> This condition will be checked always even for non XDP rings and when
>> XDP is not enabled.
>> can't we just figure a way not to have this for non XDP rings ?
>> other than having a separate napi handler i don't see a way to do this.
>> on the other hand, new NAPI handler would introduce a lot of code 
>> duplication.
> Yes I considered a separate napi handler, but again that would be more
> code duplication than it's worth, IMO.

Yeah, I agree.

>>
>> > +
>> > +   skb = tx_info->skb;
>> > +
>> > /* We do not touch skb here, so prefetch skb->users location
>> >  * to speedup consume_skb()
>> >  */
>> > @@ -476,6 +494,9 @@ static bool mlx4_en_process_tx_cq(struct net_device 
>> > *dev,
>> > ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
>> > ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
>> >
>> > +   if (ring->recycle_ring)
>> > +   return done < budget;
>> > +
>> > netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
>> >
>> > /* Wakeup Tx queue if this stopped, and ring is not full.
>> > @@ -1055,3 +1076,106 @@ tx_drop:
>> > return NETDEV_TX_OK;
>> >  }
>> >
>> > +netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame,
>> > +  struct net_device *dev, unsigned int length,
>> > +  int tx_ind, int *doorbell_pending)
>> > +{
>> > +   struct mlx4_en_priv *priv = netdev_priv(dev);
>> > +   union mlx4_wqe_qpn_vlan

Re: [PATCH v2 net-next] net: vrf: Documentation update

2016-07-13 Thread David Miller
From: David Ahern 
Date: Wed, 13 Jul 2016 06:19:37 -0600

> Update vrf documentation for changes made to 4.4 - 4.8 kernels
> and iproute2 support for vrf keyword.
> 
> Signed-off-by: David Ahern 
> ---
> v2
> - comments from Frank Kellerman: extra whitespace in front of a neigh show
>   command. Convert the brief link example to 'vrf red'.

Oops, I applied v1 already, please send me a relative patch with the
changes.

Thanks.


Re: [PATCH net 0/2] limit sk_filter trim to payload

2016-07-13 Thread David Miller
From: Willem de Bruijn 
Date: Tue, 12 Jul 2016 18:18:55 -0400

> From: Willem de Bruijn 
> 
> Sockets can apply a filter to incoming packets to drop or trim them.
> Fix two codepaths that call skb_pull/__skb_pull after sk_filter
> without checking for packet length.
> 
> Reading beyond skb->tail after trimming happens in more codepaths, but
> safety of reading in the linear segment is based on minimum allocation
> size (MAX_HEADER, GRO_MAX_HEAD, ..).

Series applied and queued up for -stable, thanks.


Re: [PATCH v2 net] tcp: make challenge acks less predictable

2016-07-13 Thread Yue Cao
I see your point and I agree with you that SSL protects victims from
this hijacking attack, especially with full HSTS.

For Windows case, since Windows is a black box for us, we tested its
Challenge ACK mechanism with Windows Server 2012 R2 Base and Windows
Server 2008 R2 from Amazon EC2. The results show that Windows also adds
some strategies to mitigate the blind in-window attack problem, but the
mitigation results are not the same as what is described in RFC 5961.

Please let me know if I said something wrong. Thanks for the fix!

Best,
Yue

On Mon, Jul 11, 2016 at 1:02 AM, Eric Dumazet  wrote:
> On Sun, 2016-07-10 at 11:28 -0700, Yue Cao wrote:
>> This second patch does make our attack much harder but it's still
>> possible to do such off-path attack with enough network bandwidth.
>> Here is our modified attack for this second patch.
>>
>> Modified Attack:
>> Main idea of our attack is to send multiple same spoofed packets in 1
>> second so attacker can confirm if it's a right guess or wrong guess.
>> In more detail, attacker sends more than 1000 (e.g. 1500) spoofed
>> packets for a same guessed value at beginning. After that, attacker
>> sends 1500 packets during the same second to determine whether
>> previous guess is right or wrong, by using following rules:
>> If attacker receives less than 500 Challenge ACKs, it's a right guess.
>> For a example, if 1500 spoofed packets are sent with a correct
>> value(right guess), all Challenge ACKs will be sent to victim client
>> in that second and attacker receives nothing. Otherwise, it's a wrong
>> guess.
>>
>> Since this global rate limit always leaks some information as a
>> side-channel, we are wondering if eliminating it completely would be a
>> good idea. In fact, according to our latest test, FreeBSD and Windows
>> do not have any such rate limit implemented. Looking forward to your
>> replies.
>
> Are you sure Windows is implementing RFC 5961 ? Linux got in in 3.6.
>
> We do want RFC 5961, compared to the small nuisance of the attack you
> describe.
>
> Nuisance of having a way for hackers to send a RST packet after
> consuming thousands of probe packets is nothing, compared to the
> nuisance of ACK storms we had before rate limiting was added in 3.6 (and
> refined in 4.0). This was a serious problem for real servers, because of
> buggy firewalls and appliances.
>
> You probably know that if someone worries about TCP flows being
> compromised, it should use SSL, so that traffic injection is less likely
> to happen.
>
> Most TCP flows in the Internet are short lived (less than 1 minute).
>
> Having to establish about 500 flows to the victim is already a
> challenge, since the victim would already be in trouble if it was
> allowing so many idle flows.
>
> So the 'solution' would be to backport
> f2b2c582e82429270d5818fbabe653f4359d7024
> ("tcp: mitigate ACK loops for connections as tcp_sock")
>
> Then apply the v2 patch so that the limit is randomized.
>
> Then set the default limit to 2^31
>
>
>


Re: [PATCH net-next 00/10] Mellanox 100G mlx5 Bulk flow statistics and SRIOV TC offloads

2016-07-13 Thread David Miller
From: David Miller 
Date: Wed, 13 Jul 2016 11:46:25 -0700 (PDT)

> Series applied, thanks.

Actually, I have to revert.  Please fix this build warning and resubmit,
thanks.

drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c: In function 
‘mlx5_fc_stats_work’:
drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c:191:48: warning: ‘last’ 
may be used uninitialized in this function [-Wmaybe-uninitialized]
   node = mlx5_fc_stats_query(dev, counter, last->id);
^


[iproute PATCH v4 4/5] No need to initialize rtattr fields before parsing

2016-07-13 Thread Phil Sutter
Since parse_rtattr_flags() calls memset already, there is no need for
callers to do so themselves.

Signed-off-by: Phil Sutter 
---
 ip/ipaddress.c | 2 +-
 tc/tc_class.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index cfcebe76af399..60862c5700330 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -449,7 +449,7 @@ static void print_num(FILE *fp, unsigned int width, 
uint64_t count)
 
 static void print_vf_stats64(FILE *fp, struct rtattr *vfstats)
 {
-   struct rtattr *vf[IFLA_VF_STATS_MAX + 1] = {};
+   struct rtattr *vf[IFLA_VF_STATS_MAX + 1];
 
if (vfstats->rta_type != IFLA_VF_STATS) {
fprintf(stderr, "BUG: rta type is %d\n", vfstats->rta_type);
diff --git a/tc/tc_class.c b/tc/tc_class.c
index 1690ec1bbfad8..f3864d22f3c4d 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -219,7 +219,7 @@ static void graph_cls_show(FILE *fp, char *buf, struct 
hlist_head *root_list,
 {
struct hlist_node *n, *tmp_cls;
char cls_id_str[256] = {};
-   struct rtattr *tb[TCA_MAX + 1] = {};
+   struct rtattr *tb[TCA_MAX + 1];
struct qdisc_util *q;
char str[100] = {};
 
@@ -304,7 +304,7 @@ int print_class(const struct sockaddr_nl *who,
FILE *fp = (FILE *)arg;
struct tcmsg *t = NLMSG_DATA(n);
int len = n->nlmsg_len;
-   struct rtattr *tb[TCA_MAX + 1] = {};
+   struct rtattr *tb[TCA_MAX + 1];
struct qdisc_util *q;
char abuf[256];
 
-- 
2.8.2



[iproute PATCH v4 2/5] Use C99 style initializers everywhere

2016-07-13 Thread Phil Sutter
This big patch was compiled by vimgrepping for memset calls and changing
to C99 initializer if applicable. One notable exception is the
initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
for older gcc versions (at least <=3.4.6).

Calls to memset for struct rtattr pointer fields for parse_rtattr*()
were just dropped since they are not needed.

The changes here allowed the compiler to discover some unused variables,
so get rid of them, too.

Signed-off-by: Phil Sutter 
---
Changes since v3:
- Use empty instead of zero initializer.
Changes since v2:
- Flatten initializers.
- Leave a final comma in place.
- Fix checkpatch warnings.
- Initialize nlmsg_seq in the declaration, too.
- Use C99-style init in tc_bpf.c to get rid of the memset().
Changes since v1:
- Dropped former changes to tc/tc_bpf.c as they are incompatible to older
  gcc versions (at least <=3.4.6).
---
 bridge/fdb.c |  25 ++---
 bridge/link.c|  14 +++
 bridge/mdb.c |  17 -
 bridge/vlan.c|  17 -
 genl/ctrl.c  |  44 +-
 ip/ip6tunnel.c   |  10 ++---
 ip/ipaddress.c   |  31 +++-
 ip/ipaddrlabel.c |  21 ---
 ip/iplink.c  |  61 +-
 ip/iplink_can.c  |   4 +-
 ip/ipmaddr.c |  25 -
 ip/ipmroute.c|   8 +---
 ip/ipneigh.c |  30 ++-
 ip/ipnetconf.c   |  10 ++---
 ip/ipnetns.c |  39 +---
 ip/ipntable.c|  25 -
 ip/iproute.c |  78 ++-
 ip/iprule.c  |  22 +--
 ip/iptoken.c |  19 --
 ip/iptunnel.c|  31 +---
 ip/ipxfrm.c  |  26 -
 ip/link_gre.c|  18 -
 ip/link_gre6.c   |  18 -
 ip/link_ip6tnl.c |  25 +
 ip/link_iptnl.c  |  22 +--
 ip/link_vti.c|  18 -
 ip/link_vti6.c   |  18 -
 ip/xfrm_policy.c |  99 -
 ip/xfrm_state.c  | 110 +++
 lib/libnetlink.c |  77 ++
 lib/ll_map.c |   1 -
 misc/arpd.c  |  64 ++--
 misc/ss.c|  37 +++
 tc/e_bpf.c   |   7 +---
 tc/em_cmp.c  |   4 +-
 tc/em_ipset.c|   4 +-
 tc/em_meta.c |   4 +-
 tc/em_nbyte.c|   4 +-
 tc/em_u32.c  |   4 +-
 tc/f_flow.c  |   3 --
 tc/f_flower.c|   3 +-
 tc/f_fw.c|   6 +--
 tc/f_route.c |   3 --
 tc/f_rsvp.c  |   6 +--
 tc/f_u32.c   |  12 ++
 tc/m_bpf.c   |   5 +--
 tc/m_csum.c  |   4 +-
 tc/m_ematch.c|   4 +-
 tc/m_gact.c  |   5 +--
 tc/m_ife.c   |   5 +--
 tc/m_mirred.c|   7 +---
 tc/m_nat.c   |   4 +-
 tc/m_pedit.c |   8 +---
 tc/m_police.c|   5 +--
 tc/q_atm.c   |   3 +-
 tc/q_cbq.c   |  22 +++
 tc/q_choke.c |   4 +-
 tc/q_codel.c |   3 +-
 tc/q_dsmark.c|   1 -
 tc/q_fifo.c  |   4 +-
 tc/q_fq_codel.c  |   3 +-
 tc/q_hfsc.c  |  13 ++-
 tc/q_htb.c   |  15 +++-
 tc/q_netem.c |  16 +++-
 tc/q_red.c   |   4 +-
 tc/q_sfb.c   |  17 -
 tc/q_sfq.c   |   4 +-
 tc/q_tbf.c   |   4 +-
 tc/tc_bpf.c  |  54 ++-
 tc/tc_class.c|  31 ++--
 tc/tc_exec.c |   3 +-
 tc/tc_filter.c   |  33 ++---
 tc/tc_qdisc.c|  33 ++---
 tc/tc_stab.c |   4 +-
 tc/tc_util.c |   3 +-
 75 files changed, 532 insertions(+), 913 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index be849f980a802..59538b1e16506 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -177,16 +177,15 @@ static int fdb_show(int argc, char **argv)
struct nlmsghdr n;
struct ifinfomsgifm;
charbuf[256];
-   } req;
+   } req = {
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+   .ifm.ifi_family = PF_BRIDGE,
+   };
 
char *filter_dev = NULL;
char *br = NULL;
int msg_size = sizeof(struct ifinfomsg);
 
-   memset(&req, 0, sizeof(req));
-   req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-   req.ifm.ifi_family = PF_BRIDGE;
-
while (argc > 0) {
if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 
0) {
NEXT_ARG();
@@ -247,7 +246,13 @@ static int fdb_modify(int cmd, int flags, int argc, char 
**argv)
struct nlmsghdr n;
struct ndmsgndm;
charbuf[256];
-   } req;
+   } req = {
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
+   .n.nlmsg_flags = NLM_F_REQUEST | flags,
+   .n.nlmsg_type = cmd,
+   .ndm.ndm_family = PF_BRIDGE,
+   .ndm.ndm_state = NUD_NOARP,
+   };
char *addr = NULL;
char *

[iproute PATCH v4 1/5] tc: m_action: Improve conversion to C99 style initializers

2016-07-13 Thread Phil Sutter
This improves my initial change in the following points:

- Flatten embedded struct's initializers.
- No need to initialize variables to zero as the key feature of C99
  initializers is to do this implicitly.
- By relocating the declaration of struct rtattr *tail, it can be
  initialized at the same time.

Fixes: a0a73b298a579 ("tc: m_action: Use C99 style initializers for struct req")
Signed-off-by: Phil Sutter 
---
Changes since v2:
- Don't drop the "superfluous" comma.
- Flatten initializers.
Changes since v1:
- Created this patch.
---
 tc/m_action.c | 23 +++
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index ea16817aefd4f..806fdd197965d 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -395,13 +395,10 @@ static int tc_action_gd(int cmd, unsigned int flags, int 
*argc_p, char ***argv_p
struct tcamsg   t;
charbuf[MAX_MSG];
} req = {
-   .n = {
-   .nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .nlmsg_flags = NLM_F_REQUEST | flags,
-   .nlmsg_type = cmd,
-   },
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
+   .n.nlmsg_flags = NLM_F_REQUEST | flags,
+   .n.nlmsg_type = cmd,
.t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
};
 
argc -= 1;
@@ -491,23 +488,18 @@ static int tc_action_modify(int cmd, unsigned int flags, 
int *argc_p, char ***ar
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-
-   struct rtattr *tail;
struct {
struct nlmsghdr n;
struct tcamsg   t;
charbuf[MAX_MSG];
} req = {
-   .n = {
-   .nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .nlmsg_flags = NLM_F_REQUEST | flags,
-   .nlmsg_type = cmd,
-   },
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
+   .n.nlmsg_flags = NLM_F_REQUEST | flags,
+   .n.nlmsg_type = cmd,
.t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
};
+   struct rtattr *tail = NLMSG_TAIL(&req.n);
 
-   tail = NLMSG_TAIL(&req.n);
argc -= 1;
argv += 1;
if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
@@ -540,7 +532,6 @@ static int tc_act_list_or_flush(int argc, char **argv, int 
event)
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
.t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
};
 
tail = NLMSG_TAIL(&req.n);
-- 
2.8.2



[iproute PATCH v4 5/5] Makefile: Allow to override CC

2016-07-13 Thread Phil Sutter
This makes it easier to build iproute2 with a custom compiler.

While at it, make HOSTCC default to the value of CC if not explicitly
set elsewhere.

Signed-off-by: Phil Sutter 
---
 Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 15c81ecfdca3a..fa200ddb76679 100644
--- a/Makefile
+++ b/Makefile
@@ -34,8 +34,8 @@ ADDLIB+=ipx_ntop.o ipx_pton.o
 #options for mpls
 ADDLIB+=mpls_ntop.o mpls_pton.o
 
-CC = gcc
-HOSTCC = gcc
+CC := gcc
+HOSTCC ?= $(CC)
 DEFINES += -D_GNU_SOURCE
 # Turn on transparent support for LFS
 DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
-- 
2.8.2



[iproute PATCH v4 3/5] Replace malloc && memset by calloc

2016-07-13 Thread Phil Sutter
This only replaces occurrences where the newly allocated memory is
cleared completely afterwards, as in other cases it is a theoretical
performance hit although code would be cleaner this way.

Signed-off-by: Phil Sutter 
---
Changes since v2:
- Fix checkpatch errors.
---
 genl/genl.c|  3 +--
 lib/names.c|  7 ++-
 misc/lnstat.c  |  6 ++
 misc/lnstat_util.c |  4 +---
 tc/em_canid.c  |  4 ++--
 tc/m_action.c  |  3 +--
 tc/m_ipt.c | 13 -
 tc/m_pedit.c   |  3 +--
 tc/tc.c|  9 +++--
 tc/tc_bpf.c|  4 +---
 tc/tc_class.c  |  3 +--
 tc/tc_exec.c   |  3 +--
 12 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/genl/genl.c b/genl/genl.c
index e33fafdf2f524..747074b029a7b 100644
--- a/genl/genl.c
+++ b/genl/genl.c
@@ -86,9 +86,8 @@ reg:
return f;
 
 noexist:
-   f = malloc(sizeof(*f));
+   f = calloc(1, sizeof(*f));
if (f) {
-   memset(f, 0, sizeof(*f));
strncpy(f->name, str, 15);
f->parse_genlopt = parse_nofopt;
f->print_genlopt = print_nofopt;
diff --git a/lib/names.c b/lib/names.c
index 3b5b0b1e1201a..fbd6503f22d42 100644
--- a/lib/names.c
+++ b/lib/names.c
@@ -54,15 +54,12 @@ struct db_names *db_names_alloc(void)
 {
struct db_names *db;
 
-   db = malloc(sizeof(*db));
+   db = calloc(1, sizeof(*db));
if (!db)
return NULL;
 
-   memset(db, 0, sizeof(*db));
-
db->size = MAX_ENTRIES;
-   db->hash = malloc(sizeof(struct db_entry *) * db->size);
-   memset(db->hash, 0, sizeof(struct db_entry *) * db->size);
+   db->hash = calloc(db->size, sizeof(struct db_entry *));
 
return db;
 }
diff --git a/misc/lnstat.c b/misc/lnstat.c
index 659a01bd69931..863fd4d9f03f2 100644
--- a/misc/lnstat.c
+++ b/misc/lnstat.c
@@ -182,10 +182,8 @@ static struct table_hdr *build_hdr_string(struct 
lnstat_file *lnstat_files,
static struct table_hdr th;
int ofs = 0;
 
-   for (i = 0; i < HDR_LINES; i++) {
-   th.hdr[i] = malloc(HDR_LINE_LENGTH);
-   memset(th.hdr[i], 0, HDR_LINE_LENGTH);
-   }
+   for (i = 0; i < HDR_LINES; i++)
+   th.hdr[i] = calloc(1, HDR_LINE_LENGTH);
 
for (i = 0; i < fps->num; i++) {
char *cname, *fname = fps->params[i].lf->name;
diff --git a/misc/lnstat_util.c b/misc/lnstat_util.c
index d918151282f55..cc54598fe1bef 100644
--- a/misc/lnstat_util.c
+++ b/misc/lnstat_util.c
@@ -173,15 +173,13 @@ static struct lnstat_file *alloc_and_open(const char 
*path, const char *file)
struct lnstat_file *lf;
 
/* allocate */
-   lf = malloc(sizeof(*lf));
+   lf = calloc(1, sizeof(*lf));
if (!lf) {
fprintf(stderr, "out of memory\n");
return NULL;
}
 
/* initialize */
-   memset(lf, 0, sizeof(*lf));
-
/* de->d_name is guaranteed to be <= NAME_MAX */
strcpy(lf->basename, file);
strcpy(lf->path, path);
diff --git a/tc/em_canid.c b/tc/em_canid.c
index 16f6ed5c0b7a4..ceb64cb933f51 100644
--- a/tc/em_canid.c
+++ b/tc/em_canid.c
@@ -106,8 +106,8 @@ static int canid_parse_eopt(struct nlmsghdr *n, struct 
tcf_ematch_hdr *hdr,
if (args == NULL)
return PARSE_ERR(args, "canid: missing arguments");
 
-   rules.rules_raw = malloc(sizeof(struct can_filter) * 
rules.rules_capacity);
-   memset(rules.rules_raw, 0, sizeof(struct can_filter) * 
rules.rules_capacity);
+   rules.rules_raw = calloc(rules.rules_capacity,
+sizeof(struct can_filter));
 
do {
if (!bstrcmp(args, "sff")) {
diff --git a/tc/m_action.c b/tc/m_action.c
index 806fdd197965d..24f8b5d855211 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -126,9 +126,8 @@ noexist:
goto restart_s;
}
 #endif
-   a = malloc(sizeof(*a));
+   a = calloc(1, sizeof(*a));
if (a) {
-   memset(a, 0, sizeof(*a));
strncpy(a->id, "noact", 15);
a->parse_aopt = parse_noaopt;
a->print_aopt = print_noaopt;
diff --git a/tc/m_ipt.c b/tc/m_ipt.c
index 098f610f9439a..d6f62bd6b32c9 100644
--- a/tc/m_ipt.c
+++ b/tc/m_ipt.c
@@ -164,16 +164,11 @@ get_target_name(const char *name)
return NULL;
 #endif
 
-   new_name = malloc(strlen(name) + 1);
-   lname = malloc(strlen(name) + 1);
-   if (new_name)
-   memset(new_name, '\0', strlen(name) + 1);
-   else
+   new_name = calloc(1, strlen(name) + 1);
+   lname = calloc(1, strlen(name) + 1);
+   if (!new_name)
exit_error(PARAMETER_PROBLEM, "get_target_name");
-
-   if (lname)
-   memset(lname, '\0', strlen(name) + 1);
-   else
+   if (!lname)
exit_error(PARAMETER_PROBLEM, "get_target_name");
 
strcpy(new_name, name);
diff --git a/tc/m_ped

Re: [PATCH net-next 00/10] Mellanox 100G mlx5 Bulk flow statistics and SRIOV TC offloads

2016-07-13 Thread David Miller
From: Saeed Mahameed 
Date: Wed, 13 Jul 2016 00:28:56 +0300

> This series from Amir and Or deals with two enhancements for the mlx5 TC 
> offloads.
> 
> The 1st two patches add bulk reading of flow counters. Few bulk counter 
> queries are
> used instead of issuing thousands firmware commands per second to get 
> statistics of all
> flows set to HW.
> 
> The next patches add TC based SRIOV offloading to mlx5, as a follow up for 
> the e-switch
> offloads mode and the VF representors. When the e-switch is set to the (new) 
> "offloads"
> mode, we can now offload TC/flower drop and forward rules, the forward action 
> we offload
> is TC mirred/redirect.
> 
> The above is done by the VF representor netdevices exporting the setup_tc ndo 
> where from
> there we're re-using and enhancing the existing mlx5 TC offloads sub-module 
> which now
> works for both the NIC and the SRIOV cases.
> 
> The series is applied on top d3fc0353f7c7 ('ipv4: af_inet: make it explicitly 
> non-modular')
> and it has no merge issues with the on-going net submission ('mlx5 tx timeout 
> watchdog fixes')

Series applied, thanks.


[iproute PATCH v4 0/5] Big C99 style initializer rework

2016-07-13 Thread Phil Sutter
This is v4 of my C99-style initializer related patch series. The changes
since v3 are:

- Use empty initializer instead of the universal zero initializer:
  The latter one triggers warnings in older GCCs, and this appears to
  be the least intrusive workaround. Plus, empty initializers are used
  all over the code already, so it won't make things worse. (GCC in
  pedantic mode does not like them, but that is a can of worms by
  itself.)

- Dropped patch 6 (unsigned value comparison simplification):
  It unintendedly changes that comparison's semantics, and I am not
  completely sure the change is correct - therefore rather leave it as
  is.

- Rebased onto current origin/master again (no conflicts).

For reference, here's the v3 changelog:

- Flattened embedded struct's initializers:
  Since the field names are very short, I figured it makes more sense to
  keep indenting low. Also, the same style is already used in
  ip/xfrm_policy.c so take that as an example.

- Moved leftover nlmsg_seq initializing into the common place as well:
  I was unsure whether this is a good idea at first (due to the
  increment), but again it's done in ip/xfrm_policy.c as well so should
  be fine.

- Added a comma after the last field initializer as suggested by Jakub.

- Dropped patch 7 since it was NACKed.

- Eliminated checkpatch non-compliance.

- Second go at union bpf_attr in tc/tc_bpf.c:
  I figured that while it is not possible to initialize fields, gcc-3.4.6
  does not complain when setting the whole union to zero using '= {0}'.
  So I did this and thereby at least got rid of the memset calls.

For reference, here's the v2 changelog:

- Rebased onto current upstream master:
  My own commit a0a73b298a579 ("tc: m_action: Use C99 style initializers
  for struct req") contains most of the changes to tc/m_action.c already,
  so I put the remaining ones into a dedicated patch (the first one here)
  with a better description.

- Tested against gcc-3.4.6:
  This is the oldest gcc version I was able to install locally. It indeed
  does not like the former changes in tc/tc_bpf.c, so I reverted them.
  Apart from emitting many warnings, it successfully compiles the
  sources.

In the process of compatibility testing, I made a few more changes which
make sense to have:

- New patch 5 allows to conveniently override the compiler via command
  line.

- New patch 6 eliminates a warning with old gcc but looks valid in
  general.

- A warning made me look at ip/tcp_metrics.c and I found a minor code
  simplification (patch 7).

Phil Sutter (5):
  tc: m_action: Improve conversion to C99 style initializers
  Use C99 style initializers everywhere
  Replace malloc && memset by calloc
  No need to initialize rtattr fields before parsing
  Makefile: Allow to override CC

 Makefile   |   4 +-
 bridge/fdb.c   |  25 ++--
 bridge/link.c  |  14 +++
 bridge/mdb.c   |  17 -
 bridge/vlan.c  |  17 -
 genl/ctrl.c|  44 +
 genl/genl.c|   3 +-
 ip/ip6tunnel.c |  10 ++---
 ip/ipaddress.c |  33 +++-
 ip/ipaddrlabel.c   |  21 --
 ip/iplink.c|  61 -
 ip/iplink_can.c|   4 +-
 ip/ipmaddr.c   |  25 
 ip/ipmroute.c  |   8 +---
 ip/ipneigh.c   |  30 ++-
 ip/ipnetconf.c |  10 ++---
 ip/ipnetns.c   |  39 +--
 ip/ipntable.c  |  25 
 ip/iproute.c   |  78 +
 ip/iprule.c|  22 +--
 ip/iptoken.c   |  19 -
 ip/iptunnel.c  |  31 +--
 ip/ipxfrm.c|  26 -
 ip/link_gre.c  |  18 -
 ip/link_gre6.c |  18 -
 ip/link_ip6tnl.c   |  25 +---
 ip/link_iptnl.c|  22 +--
 ip/link_vti.c  |  18 -
 ip/link_vti6.c |  18 -
 ip/xfrm_policy.c   |  99 +++
 ip/xfrm_state.c| 110 ++---
 lib/libnetlink.c   |  77 ++---
 lib/ll_map.c   |   1 -
 lib/names.c|   7 +---
 misc/arpd.c|  64 ++-
 misc/lnstat.c  |   6 +--
 misc/lnstat_util.c |   4 +-
 misc/ss.c  |  37 +++---
 tc/e_bpf.c |   7 +---
 tc/em_canid.c  |   4 +-
 tc/em_cmp.c|   4 +-
 tc/em_ipset.c  |   4 +-
 tc/em_meta.c   |   4 +-
 tc/em_nbyte.c  |   4 +-
 tc/em_u32.c|   4 +-
 tc/f_flow.c|   3 --
 tc/f_flower.c  |   3 +-
 tc/f_fw.c  |   6 +--
 tc/f_route.c   |   3 --
 tc/f_rsvp.c|   6 +--
 tc/f_u32.c |  12 ++
 tc/m_action.c  |  26 -
 tc/m_bpf.c |   5 +--
 tc/m_csum.c|   4 +-
 tc/m_ematch.c  |   4 +-
 tc/m_gact.c|   5 +--
 tc/m_ife.c |   5 +--
 tc/m_ipt.c |  13 ++-
 tc/m_mirred.c  |   7 +---
 tc/m_nat.c  

Re: [PATCH net 0/2] mlx5 tx timeout watchdog fixes

2016-07-13 Thread David Miller
From: Saeed Mahameed 
Date: Wed, 13 Jul 2016 00:06:58 +0300

> This patch set provides two trivial fixes for the tx timeout series lately 
> applied into net 4.7.
> 
> From Daniel, detect stuck queues due to BQL
> From Mohamad, fix tx timeout watchdog false alarm
> 
> Hopefully those two fixes will make it to -stable, assuming 
> 3947ca185999 ('net/mlx5e: Implement ndo_tx_timeout callback') was also 
> backported to -stable.

Series applied.


Re: [PATCH net-next] net: vrf: Documentation update

2016-07-13 Thread David Miller
From: David Ahern 
Date: Tue, 12 Jul 2016 15:04:23 -0600

> Update vrf documentation for changes made to 4.4 - 4.8 kernels
> and iproute2 support for vrf keyword.
> 
> Signed-off-by: David Ahern 

Applied, thanks David.


pull-request: wireless-drivers-next 2016-07-13

2016-07-13 Thread Kalle Valo
Hi Dave,

here's a pull request for net-next. This time there are few conflicts
due to the cfg80211 scan API changes, and one of them is easy to miss,
so please pay extra attention to them. Otherwise there's nothing
really out of the ordinary. Please note that I also pulled wireless-drivers
to wireless-drivers-next to reduce the amount of conflicts.

So about the conflicts, the obvious are notified by git:

CONFLICT (content): Merge conflict in 
drivers/net/wireless/marvell/mwifiex/cmdevt.c
CONFLICT (content): Merge conflict in 
drivers/net/wireless/intel/iwlwifi/mvm/scan.c

Basically the major change is that in iwlwifi del_timer() is changed to
cancel_delayed_work() and in mwifiex the code was refactored to use
mwifiex_cancel_scan(). But the tricky part comes here which is easy to
miss:

Auto-merging drivers/net/wireless/marvell/mwifiex/scan.c

You need to convert the scan code in mwifiex_cancel_scan():

cfg80211_scan_done(priv->scan_request, 1);

to use the new API:

struct cfg80211_scan_info info = {
.aborted = true,
};

[...]

cfg80211_scan_done(priv->scan_request, &info);

I have attached the output from git diff as an example how to resolve
this, hopefully that helps. Please let me know if there are any problems
or if you want to handle these differently.

Kalle


The following changes since commit 742fb20fd4c75bd08733b0ea232c7e0fa67a6f87:

  net: ethernet: ti: cpdma: switch to use genalloc (2016-06-29 04:16:11 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git 
tags/wireless-drivers-next-for-davem-2016-07-13

for you to fetch changes up to 25f700ef0653d7644ed273f8770230e734cae726:

  iwlwifi: add missing type declaration (2016-07-12 14:51:57 +0300)


wireless-drivers-next patches for 4.8

Major changes:

iwlwifi

* more work on the RX path for the 9000 device series
* some more dynamic queue allocation work
* SAR BIOS implementation
* some work on debugging capabilities
* added support for GCMP encryption
* data path rework in preparation for new HW
* some cleanup to remove transport dependency on mac80211
* support for MSIx in preparation for new HW
* lots of work in preparation for HW support (9000 and a000 series)

mwifiex

* implement get_tx_power and get_antenna cfg80211 operation callbacks

wl18xx

* add support for 64bit clock

rtl8xxxu

* aggregation support (optional for now)

Also wireless-drivers is merged to fix some conflicts.


Amitkumar Karwar (8):
  mwifiex: fix system hang problem after resume
  mwifiex: fix AP unable to start in VHT40 problem
  mwifiex: fix AP start problem for newly added interface
  mwifiex: code rearrangement in suspend handler
  mwifiex: clear scan_aborting flag
  mwifiex: fix NULL pointer dereference during suspend
  mwifiex: fix scan_block flag handling
  mwifiex: Change default firmware for PCIe8997 chipset

Andrei Otcheretianski (1):
  iwlwifi: mvm: Support CSA countdown offloading

Andy Shevchenko (1):
  rtl8xxxu: tuse %*ph to dump buffers

Arnd Bergmann (6):
  rtlwifi: use s8 instead of char
  wireless: airo: rename 'register' variable
  wireless: brcmsmac: fix old-style declaration
  wireless: ipw2200: fix old-style declaration
  iwlwifi: mvm: avoid harmless -Wmaybe-uninialized warning
  iwlwifi: add missing type declaration

Avraham Stern (1):
  iwlwifi: rename CAPA_P2P_STANDALONE_UAPSD to CAPA_P2P_SCM_UAPSD

Ayala Beker (2):
  iwlwifi: mvm: fix RX mpdu status enum
  iwlwifi: mvm: add support for GCMP encryption

Bhaktipriya Shridhar (1):
  libertas_tf: Remove create_workqueue

Brian Norris (1):
  mwifiex: mask PCIe interrupts before removal

Bruno Herrera (1):
  wlcore: sdio: Fix crash on wlcore_probe_of when failing to parse/map irq

Dan Carpenter (2):
  iwlwifi: mvm: remove an unused variable
  iwlwifi: mvm: silence uninitialized variable warning

Emmanuel Grumbach (7):
  iwlwifi: advertise maximal MPDU length when Rx MQ is supported
  iwlwifi: pcie: enable interrupts before releasing the NIC's CPU
  iwlwifi: mvm: cleanup the coex code
  iwlwifi: mvm: fix coex related comments
  iwlwifi: mvm: fix the channel inhibition table for Channel 14
  iwlwifi: mvm: unmap the paging memory before freeing it
  iwlwifi: pcie: fix a race in firmware loading flow

Ganapathi Bhat (1):
  mwifiex: Fix an issue spotted by KASAN

Golan Ben-Ami (2):
  iwlwifi: Reserve iwl_fw_error_dump_type enum
  iwlwifi: mvm: write the correct internal TXF index

Gregory Greenman (1):
  iwlwifi: mvm: rs: add rate scaling support for 160MHz channels

Guenter Roeck (1):
  iwlwifi: dvm: Remove unused array 'iwlagn_loose_lookup'

Guy Mishol (1):
  wlcore: reconfigure sta rates on authori

[PATCH net-next 6/6] sctp: only check for ECN if peer is using it

2016-07-13 Thread Marcelo Ricardo Leitner
Currently only read-only checks are performed up to the point on where
we check if peer is ECN capable, checks which we can avoid otherwise.
The flag ecn_ce_done is only used to perform this check once per
incoming packet, and nothing more.

Thus this patch moves the peer check up.

Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/sm_statefuns.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 
b7c1f7f3c8388400e51e3fbdbe099bc354559913..d88bb2b0b69913ad5962f9a5655d413f2c210ed0
 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6118,12 +6118,11 @@ static int sctp_eat_data(const struct sctp_association 
*asoc,
 * chunk later.
 */
 
-   if (!chunk->ecn_ce_done) {
+   if (asoc->peer.ecn_capable && !chunk->ecn_ce_done) {
struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af;
chunk->ecn_ce_done = 1;
 
-   if (af->is_ce(sctp_gso_headskb(chunk->skb)) &&
-   asoc->peer.ecn_capable) {
+   if (af->is_ce(sctp_gso_headskb(chunk->skb))) {
/* Do real work as sideffect. */
sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
SCTP_U32(tsn));
-- 
2.7.4



[PATCH net-next 0/6] sctp: allow GSO frags to access the chunk too

2016-07-13 Thread Marcelo Ricardo Leitner
Patchset is named after the most important fix in it. First two patches
are preparing the grounds for the 3rd patch.

After the 3rd, they are not strictly logically related to the patchset,
but I kept them together as they depend on each other.

More details on patch changelogs.

Thanks!

Marcelo Ricardo Leitner (6):
  sctp: allow others to use sctp_input_cb
  sctp: reorder sctp_ulpevent and shrink msg_flags
  sctp: allow GSO frags to access the chunk too
  sctp: avoid identifying address family many times for a chunk
  sctp: do not clear chunk->ecn_ce_done flag
  sctp: only check for ECN if peer is using it

 include/net/sctp/structs.h  | 23 +++
 include/net/sctp/ulpevent.h | 12 ++--
 net/sctp/input.c| 12 +---
 net/sctp/inqueue.c  |  9 -
 net/sctp/ipv6.c |  9 -
 net/sctp/protocol.c |  1 +
 net/sctp/sm_make_chunk.c| 20 
 net/sctp/sm_statefuns.c |  9 +++--
 net/sctp/socket.c   | 10 +++---
 net/sctp/ulpevent.c | 14 +++---
 10 files changed, 68 insertions(+), 51 deletions(-)

-- 
2.7.4



[PATCH net-next 1/6] sctp: allow others to use sctp_input_cb

2016-07-13 Thread Marcelo Ricardo Leitner
We process input path in other files too and having access to it is
nice, so move it to a header where it's shared.

Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/structs.h | 15 +++
 net/sctp/input.c   | 11 ---
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 
8626bdd3249a9283955fe81bc3255be0a18717f9..966c3a40039c12a7c525612594a51312d5de1d2a
 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -59,6 +59,7 @@
 #include/* We need tq_struct.*/
 #include /* We need sctp* header structs.  */
 #include  /* We need auth specific structs */
+#include /* For inet_skb_parm */
 
 /* A convenience structure for handling sockaddr structures.
  * We should wean ourselves off this.
@@ -1092,6 +1093,20 @@ static inline void sctp_outq_cork(struct sctp_outq *q)
q->cork = 1;
 }
 
+/* SCTP skb control block.
+ * sctp_input_cb is currently used on rx and sock rx queue
+ */
+struct sctp_input_cb {
+   union {
+   struct inet_skb_parmh4;
+#if IS_ENABLED(CONFIG_IPV6)
+   struct inet6_skb_parm   h6;
+#endif
+   } header;
+   struct sctp_chunk *chunk;
+};
+#define SCTP_INPUT_CB(__skb)   ((struct sctp_input_cb *)&((__skb)->cb[0]))
+
 /* These bind address data fields common between endpoints and associations */
 struct sctp_bind_addr {
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 
6f8e676d285ead987b0a1337beec3b29c34e0a8e..7a327ff71f08985f6ebb963d5cdc9540b23d0666
 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct 
sk_buff *skb)
return 0;
 }
 
-struct sctp_input_cb {
-   union {
-   struct inet_skb_parmh4;
-#if IS_ENABLED(CONFIG_IPV6)
-   struct inet6_skb_parm   h6;
-#endif
-   } header;
-   struct sctp_chunk *chunk;
-};
-#define SCTP_INPUT_CB(__skb)   ((struct sctp_input_cb *)&((__skb)->cb[0]))
-
 /*
  * This is the routine which IP calls when receiving an SCTP packet.
  */
-- 
2.7.4



[PATCH net-next 4/6] sctp: avoid identifying address family many times for a chunk

2016-07-13 Thread Marcelo Ricardo Leitner
Identifying address family operations during rx path is not something
expensive but it's ugly to the eye to have it done multiple times,
especially when we already validated it during initial rx processing.

This patch takes advantage of the now shared sctp_input_cb and make the
pointer to the operations readily available.

Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/structs.h |  1 +
 net/sctp/input.c   |  1 +
 net/sctp/inqueue.c |  1 +
 net/sctp/sm_make_chunk.c   | 20 
 net/sctp/sm_statefuns.c|  7 ++-
 5 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 
f6f201de6fa46b3ca203c00f4970ca408edb6930..ce93c4b10d2620a3ac4c9efe39a86e5d231b51c2
 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1104,6 +1104,7 @@ struct sctp_input_cb {
 #endif
} header;
struct sctp_chunk *chunk;
+   struct sctp_af *af;
 };
 #define SCTP_INPUT_CB(__skb)   ((struct sctp_input_cb *)&((__skb)->cb[0]))
 
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 
7a327ff71f08985f6ebb963d5cdc9540b23d0666..30d72f7707b6df5b41679bbfc5e595d5a11130ea
 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -140,6 +140,7 @@ int sctp_rcv(struct sk_buff *skb)
af = sctp_get_af_specific(family);
if (unlikely(!af))
goto discard_it;
+   SCTP_INPUT_CB(skb)->af = af;
 
/* Initialize local addresses for lookups. */
af->from_skb(&src, skb, 1);
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 
147d975b04559f7858b040b1f04dbc559ef2ec78..8fc773f9b59a8a9ad123dd132cfa5b7f916732b6
 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -224,6 +224,7 @@ new_skb:
*head_cb = SCTP_INPUT_CB(chunk->head_skb);
 
cb->chunk = head_cb->chunk;
+   cb->af = head_cb->af;
}
}
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 
1c96f4740e67397e5f8b7134cffd4d0840220245..8c77b87a8565cb4f82c09cea65557dc9c8d1138f
 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk 
*chunk)
 /* What was the inbound interface for this chunk? */
 int sctp_chunk_iif(const struct sctp_chunk *chunk)
 {
-   struct sctp_af *af;
-   int iif = 0;
-
-   af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version));
-   if (af)
-   iif = af->skb_iif(chunk->skb);
+   struct sk_buff *skb = chunk->skb;
 
-   return iif;
+   return SCTP_INPUT_CB(skb)->af->skb_iif(skb);
 }
 
 /* RFC 2960 3.3.2 Initiation (INIT) (1)
@@ -1600,7 +1595,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct 
sctp_endpoint *ep,
struct sctp_association *asoc;
struct sk_buff *skb;
sctp_scope_t scope;
-   struct sctp_af *af;
 
/* Create the bare association.  */
scope = sctp_scope(sctp_source(chunk));
@@ -1610,16 +1604,10 @@ struct sctp_association *sctp_make_temp_asoc(const 
struct sctp_endpoint *ep,
asoc->temp = 1;
skb = chunk->skb;
/* Create an entry for the source address of the packet.  */
-   af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version));
-   if (unlikely(!af))
-   goto fail;
-   af->from_skb(&asoc->c.peer_addr, skb, 1);
+   SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1);
+
 nodata:
return asoc;
-
-fail:
-   sctp_association_free(asoc);
-   return NULL;
 }
 
 /* Build a cookie representing asoc.
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 
5aabf42065e2fba9388350996310b77c58369395..b7c1f7f3c8388400e51e3fbdbe099bc354559913
 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6119,13 +6119,10 @@ static int sctp_eat_data(const struct sctp_association 
*asoc,
 */
 
if (!chunk->ecn_ce_done) {
-   struct sctp_af *af;
+   struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af;
chunk->ecn_ce_done = 1;
 
-   af = sctp_get_af_specific(
-   ipver2af(ip_hdr(chunk->skb)->version));
-
-   if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) &&
+   if (af->is_ce(sctp_gso_headskb(chunk->skb)) &&
asoc->peer.ecn_capable) {
/* Do real work as sideffect. */
sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE,
-- 
2.7.4



[PATCH net-next 5/6] sctp: do not clear chunk->ecn_ce_done flag

2016-07-13 Thread Marcelo Ricardo Leitner
We should not clear that flag when switching to a new skb from a GSO skb
because it would cause ECN processing to happen multiple times per GSO
skb, which is not wanted. Instead, let it be processed once per chunk.
That is, in other words, once per IP header available.

Fixes: 90017accff61 ("sctp: Add GSO support")
Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/inqueue.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 
8fc773f9b59a8a9ad123dd132cfa5b7f916732b6..942770675f4cc0efc9686f4e4038450f060f34ae
 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -217,7 +217,6 @@ new_skb:
chunk->auth = 0;
chunk->has_asconf = 0;
chunk->end_of_packet = 0;
-   chunk->ecn_ce_done = 0;
if (chunk->head_skb) {
struct sctp_input_cb
*cb = SCTP_INPUT_CB(chunk->skb),
-- 
2.7.4



[PATCH net-next 3/6] sctp: allow GSO frags to access the chunk too

2016-07-13 Thread Marcelo Ricardo Leitner
SCTP will try to access original IP headers on sctp_recvmsg in order to
copy the addresses used. There are also other places that do similar access
to IP or even SCTP headers. But after 90017accff61 ("sctp: Add GSO
support") they aren't always there because they are only present in the
header skb.

SCTP handles the queueing of incoming data by cloning the incoming skb
and limiting to only the relevant payload. This clone has its cb updated
to something different and it's then queued on socket rx queue. Thus we
need to fix this in two moments.

For rx path, not related to socket queue yet, this patch uses a
partially copied sctp_input_cb to such GSO frags. This restores the
ability to access the headers for this part of the code.

Regarding the socket rx queue, it removes iif member from sctp_event and
also add a chunk pointer on it.

With these changes we're always able to reach the headers again.

The biggest change here is that now the sctp_chunk struct and the
original skb are only freed after the application consumed the buffer.
Note however that the original payload was already like this due to the
skb cloning.

For iif, SCTP's IPv4 code doesn't use it, so no change is necessary.
IPv6 now can fetch it directly from original's IPv6 CB as the original
skb is still accessible.

In the future we probably can simplify sctp_v*_skb_iif() stuff, as
sctp_v4_skb_iif() was called but its return value not used, and now
it's not even called, but such cleanup is out of scope for this change.

Fixes: 90017accff61 ("sctp: Add GSO support")
Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/structs.h  |  7 +++
 include/net/sctp/ulpevent.h |  2 +-
 net/sctp/inqueue.c  |  7 +++
 net/sctp/ipv6.c |  9 -
 net/sctp/protocol.c |  1 +
 net/sctp/sm_statefuns.c |  3 ++-
 net/sctp/socket.c   | 10 +++---
 net/sctp/ulpevent.c | 10 +-
 8 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 
966c3a40039c12a7c525612594a51312d5de1d2a..f6f201de6fa46b3ca203c00f4970ca408edb6930
 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1107,6 +1107,13 @@ struct sctp_input_cb {
 };
 #define SCTP_INPUT_CB(__skb)   ((struct sctp_input_cb *)&((__skb)->cb[0]))
 
+static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb)
+{
+   const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk;
+
+   return chunk->head_skb ? : skb;
+}
+
 /* These bind address data fields common between endpoints and associations */
 struct sctp_bind_addr {
 
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 
aa342645dbce446186d55151c3f507cf0e165b44..2c098cd7e7e202b6fa96e97ccb56471df27cec91
 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -48,11 +48,11 @@
  */
 struct sctp_ulpevent {
struct sctp_association *asoc;
+   struct sctp_chunk *chunk;
unsigned int rmem_len;
__u32 ppid;
__u32 tsn;
__u32 cumtsn;
-   int iif;
__u16 stream;
__u16 ssn;
__u16 flags;
diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c
index 
edabbbdfca541b830526a7a52aee18c20680c19c..147d975b04559f7858b040b1f04dbc559ef2ec78
 100644
--- a/net/sctp/inqueue.c
+++ b/net/sctp/inqueue.c
@@ -218,6 +218,13 @@ new_skb:
chunk->has_asconf = 0;
chunk->end_of_packet = 0;
chunk->ecn_ce_done = 0;
+   if (chunk->head_skb) {
+   struct sctp_input_cb
+   *cb = SCTP_INPUT_CB(chunk->skb),
+   *head_cb = SCTP_INPUT_CB(chunk->head_skb);
+
+   cb->chunk = head_cb->chunk;
+   }
}
 
chunk->chunk_hdr = ch;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 
0657d18a85bf7aa751a0456d0cc9adae3ff95e42..ae6f1a2178bab81fa14562bd1c37d1e7b1e3
 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct 
sk_buff *skb,
addr->v6.sin6_flowinfo = 0; /* FIXME */
addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif;
 
+   /* Always called on head skb, so this is safe */
sh = sctp_hdr(skb);
if (is_saddr) {
*port  = sh->source;
@@ -710,8 +711,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union 
sctp_addr *addr)
 /* Where did this skb come from?  */
 static int sctp_v6_skb_iif(const struct sk_buff *skb)
 {
-   struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb;
-   return opt->iif;
+   return IP6CB(skb)->iif;
 }
 
 /* Was this packet marked by Explicit Congestion Notification? */
@@ -780,15 +780,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, 
char *msgname,
if (ip_hdr(skb)->version == 4) {
addr->v4.sin_family = AF_INET;

[PATCH net-next 2/6] sctp: reorder sctp_ulpevent and shrink msg_flags

2016-07-13 Thread Marcelo Ricardo Leitner
The next patch needs 8 bytes in there. sctp_ulpevent has a hole due to
bad alignment; msg_flags is using 4 bytes while it actually uses only 2, so
we shrink it, and iif member (4 bytes) which can be easily fetched from
another place once the next patch is there, so we remove it and thus
creating space for 8 bytes.

Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/ulpevent.h | 10 +-
 net/sctp/ulpevent.c |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 
cccdcfd149736b315554d64c2a556e0ad6496fc8..aa342645dbce446186d55151c3f507cf0e165b44
 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -48,15 +48,15 @@
  */
 struct sctp_ulpevent {
struct sctp_association *asoc;
-   __u16 stream;
-   __u16 ssn;
-   __u16 flags;
+   unsigned int rmem_len;
__u32 ppid;
__u32 tsn;
__u32 cumtsn;
-   int msg_flags;
int iif;
-   unsigned int rmem_len;
+   __u16 stream;
+   __u16 ssn;
+   __u16 flags;
+   __u16 msg_flags;
 };
 
 /* Retrieve the skb this event sits inside of. */
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 
d1e38308f6159c0e4da7db966c16afc6a956b554..706f5bc9f0c3083ab455ec78b963cd609a3a95b5
 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct 
sctp_ulpevent *event);
 
 /* Initialize an ULP event from an given skb.  */
 static void sctp_ulpevent_init(struct sctp_ulpevent *event,
-  int msg_flags,
+  __u16 msg_flags,
   unsigned int len)
 {
memset(event, 0, sizeof(struct sctp_ulpevent));
@@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event,
 }
 
 /* Create a new sctp_ulpevent.  */
-static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
+static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags,
   gfp_t gfp)
 {
struct sctp_ulpevent *event;
-- 
2.7.4



Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()

2016-07-13 Thread Jason Gunthorpe
On Wed, Jul 13, 2016 at 02:48:44PM +0300, Dan Carpenter wrote:
> We accidentally return success when we had intended to return an error
> code.
> 
> Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads 
> mode')
> Signed-off-by: Dan Carpenter 
> v2: return -ENOTSUPP instead of -EINVAL
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 1842dfb..7d982cf 100644
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -183,6 +183,7 @@ static int esw_create_offloads_fdb_table(struct 
> mlx5_eswitch *esw, int nvports)
>  
>   root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
>   if (!root_ns) {
> + err = -ENOTSUPP;

Did you mean ENOTSUP?

I thought ENOTSUPP was not to be used outside NFS, and isn't properly
exported to userspace..

$ find /usr/include -name "*errno*" | xargs grep 524

Jason


Re: [PATCH net] sock_diag: invert socket destroy broadcast check

2016-07-13 Thread Willem de Bruijn
On Fri, Jun 24, 2016 at 6:22 PM, Willem de Bruijn
 wrote:
> On Fri, Jun 24, 2016 at 4:41 PM, Eric W. Biederman
>  wrote:
>> Willem de Bruijn  writes:
>>
>>> From: Willem de Bruijn 
>>>
>>> Socket destruction is only broadcast for a socket sk if a diag
>>> listener is registered and sk is not a kernel socket.
>>>
>>> Invert the test to not even check for listeners for kernel sockets.
>>>
>>> The sock_diag_has_destroy_listeners invocation dereferences
>>> sock_net(sk), which for kernel sockets can be invalid as they do not
>>> take a reference on the network namespace.
>>
>> No.  That isn't so.  A kernel socket for a network namespace must be
>> destroyed in the network namespace teardown.

I spent some more time looking at this.

inet_ctl_sock_destroy does not destroy the socket if there are still
skbuff with a reference on it (or its sk_wmem_alloc). Skbs are
orphaned when they leave the namespace through dev_forward_skb, but
not when sent out a physical nic (correctly, that would break TSQ).

The bug happened with macvlan on top of bonding on top of a physical
nic. The macvlan lives in a temporary namespace. After the macvlan and
network namespace are destroyed, the physical device has a TCP RST skb
from net.ipv4->tcp_sk queued for tx completion.

I have not been able to reproduce this exact scenario, likely because tx
completion handling is on the order of microseconds and not easily
slowed sufficiently for testing. Using a tap device with skb_orphan
commented out, I can cause the issue. Commenting out skb_orphan is
clearly a gross hack. The point I wanted to verify is that underlying
device is not stopped --and its queues cleaned of skb-- when the
macvlan device is destroyed.

Network namespace teardown is complex. Am I missing a step that
prevents the above, or does this indeed sound feasible in principle
(if very unlikely in practice)?


Re: Configure traffic class to bringup DCB in back-to-back setup

2016-07-13 Thread John Fastabend
On 16-07-13 02:09 AM, ayuj wrote:
> I just checked TLV's. Below are the details:
> 

OK so not really a netdev discussion seeing it's just a user
space protocol setup issue. Going forward probably drop
netdev and add intel-wired-lan.

> OS :- CentOS 7.2
> kernel 3.10.0-327.el7.x86_64
> lldpad:- lldpad v0.9.46
> dcbtool:- v0.9.46
> ixgbe :- ixgbe-4.3.15
> 
> steps followed:- 
> 
> # modprobe ixgbe
> # service lldpad start 
> Redirecting to /bin/systemctl start  lldpad.service
> 
> # service lldpad status
> Redirecting to /bin/systemctl status  lldpad.service
> ● lldpad.service - Link Layer Discovery Protocol Agent Daemon.
>Loaded: loaded (/usr/lib/systemd/system/lldpad.service; disabled; vendor
> preset: disabled)
>Active: active (running) since Tue 2016-07-05 05:49:12 EDT; 1s ago
>  Main PID: 133737 (lldpad)
>CGroup: /system.slice/lldpad.service
>└─133737 /usr/sbin/lldpad -t
> 
> Jul 05 05:49:12 localhost.localdomain systemd[1]: Started Link Layer
> Discovery Protocol Agent Daemon..
> Jul 05 05:49:12 localhost.localdomain systemd[1]: Starting Link Layer
> Discovery Protocol Agent Daemon
> 
> lldptool -t -i p3p2 -n
> Chassis ID TLV
>   MAC: 00:1b:21:bb:2e:da
> Port ID TLV
>   MAC: 00:1b:21:bb:2e:da
> Time to Live TLV
>   120
> IEEE 8021QAZ ETS Configuration TLV
>Willing: yes
>CBS: not supported
>MAX_TCS: 8
>PRIO_MAP: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 
>TC Bandwidth: 0% 0% 0% 0% 0% 0% 0% 0% 
>TSA_MAP: 0:strict 1:strict 2:strict 3:strict 4:strict 5:strict 6:strict
> 7:strict 
> IEEE 8021QAZ PFC TLV
>Willing: yes
>MACsec Bypass Capable: no
>PFC capable traffic classes: 8
>PFC enabled: none
> End of LLDPDU TLV
> 
> Please help me in configuring traffic classes. I want to bring up a DCB setup
> in a back-to-back scenario.
> 

So at the moment it appears to be configured to use 802.1QAZ spec which
superseded the older spec even though lldpad supports both. Note the
tool itself really requires some spec knowledge to use correctly. The
spec to read is 802.1Q.

To configure it back-to-back (typical scenario is connected to a DCB
enabled switch where your administrator would setup the switch and this
would autoneg just fine) the servers need to be setup manually.

Perhaps reading if you haven't already the man page for lldptool and
lldptool-ets, lldptool-pfc would help. From the ets man page this
should kick things off,


#lldptool -T -i eth2 -V ETS-CFG \
  tsa=0:ets,1:ets,2:ets,3:ets,4:ets,5:ets,6:ets,7:ets \
  up2tc=0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7 \
  tcbw=12,12,12,12,13,13,13,13


#lldptool -T -i eth2 -V ETS-REC \
   tsa=0:ets,1:ets,2:ets,3:ets,4:ets,5:ets,6:ets,7:ets \
   up2tc=0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7 \
   tcbw=12,12,12,12,13,13,13,13

Thanks,
John




Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()

2016-07-13 Thread Leon Romanovsky
On Wed, Jul 13, 2016 at 02:48:44PM +0300, Dan Carpenter wrote:
> We accidentally return success when we had intended to return an error
> code.
> 
> Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads 
> mode')
> Signed-off-by: Dan Carpenter 
> ---
> v2: return -ENOTSUPP instead of -EINVAL

I'm a little bit confused. Why did you prefer ENOTSUPP over EOPNOTSUPP?

Thanks.


signature.asc
Description: Digital signature


[PATCH 1/2] net: nps_enet: fix coding style issues

2016-07-13 Thread Elad Kanfi
From: Elad Kanfi 

Fix following coding style problems :

ERROR: else should follow close brace '}'
+   }
+   else { /* !dst_is_aligned */

WARNING: Missing a blank line after declarations
+   u32 buf = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF);
+   put_unaligned_be32(buf, reg);

WARNING: Missing a blank line after declarations
+   u32 buf;
+   ioread32_rep(priv->regs_base + NPS_ENET_REG_RX_BUF, &buf, 1);

CHECK: Blank lines aren't necessary before a close brace '}'
+
+   }

total: 1 errors, 2 warnings, 1 checks, 683 lines checked

Signed-off-by: Elad Kanfi 
---
 drivers/net/ethernet/ezchip/nps_enet.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ezchip/nps_enet.c 
b/drivers/net/ethernet/ezchip/nps_enet.c
index 06f0317..b182e2a 100644
--- a/drivers/net/ethernet/ezchip/nps_enet.c
+++ b/drivers/net/ethernet/ezchip/nps_enet.c
@@ -46,16 +46,17 @@ static void nps_enet_read_rx_fifo(struct net_device *ndev,
if (dst_is_aligned) {
ioread32_rep(priv->regs_base + NPS_ENET_REG_RX_BUF, reg, len);
reg += len;
-   }
-   else { /* !dst_is_aligned */
+   } else { /* !dst_is_aligned */
for (i = 0; i < len; i++, reg++) {
u32 buf = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF);
+
put_unaligned_be32(buf, reg);
}
}
/* copy last bytes (if any) */
if (last) {
u32 buf;
+
ioread32_rep(priv->regs_base + NPS_ENET_REG_RX_BUF, &buf, 1);
memcpy((u8 *)reg, &buf, last);
}
@@ -459,7 +460,6 @@ static void nps_enet_set_rx_mode(struct net_device *ndev)
 | NPS_ENET_ENABLE << CFG_2_DISK_DA_SHIFT;
ge_mac_cfg_2_value = (ge_mac_cfg_2_value & ~CFG_2_DISK_MC_MASK)
 | NPS_ENET_ENABLE << CFG_2_DISK_MC_SHIFT;
-
}
 
nps_enet_reg_set(priv, NPS_ENET_REG_GE_MAC_CFG_2, ge_mac_cfg_2_value);
-- 
1.7.1



Re: [PATCH v7 09/11] net/mlx4_en: add xdp forwarding and data write support

2016-07-13 Thread Brenden Blanco
On Wed, Jul 13, 2016 at 06:25:28PM +0300, Saeed Mahameed wrote:
> On Tue, Jul 12, 2016 at 12:29 AM, Brenden Blanco  wrote:
> > A user will now be able to loop packets back out of the same port using
> > a bpf program attached to xdp hook. Updates to the packet contents from
> > the bpf program is also supported.
> >
> > For the packet write feature to work, the rx buffers are now mapped as
> > bidirectional when the page is allocated. This occurs only when the xdp
> > hook is active.
> >
> > When the program returns a TX action, enqueue the packet directly to a
> > dedicated tx ring, so as to avoid completely any locking. This requires
> > the tx ring to be allocated 1:1 for each rx ring, as well as the tx
> > completion running in the same softirq.
> >
> > Upon tx completion, this dedicated tx ring recycles pages without
> > unmapping directly back to the original rx ring. In steady state tx/drop
> > workload, effectively 0 page allocs/frees will occur.
> >
> > Signed-off-by: Brenden Blanco 
> > ---
> >  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  15 ++-
> >  drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |  19 +++-
> >  drivers/net/ethernet/mellanox/mlx4/en_rx.c  |  14 +++
> >  drivers/net/ethernet/mellanox/mlx4/en_tx.c  | 126 
> > +++-
> >  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h|  14 ++-
> >  5 files changed, 181 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c 
> > b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> > index d3d51fa..10642b1 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> > @@ -1694,6 +1694,11 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, 
> > struct ethtool_rxnfc *cmd)
> > return err;
> >  }
> >
> > +static int mlx4_en_max_tx_channels(struct mlx4_en_priv *priv)
> > +{
> > +   return (MAX_TX_RINGS - priv->rsv_tx_rings) / MLX4_EN_NUM_UP;
> > +}
> > +
> 
> MAX_TX_RING is a software limitation made to limit netdev real_num_tx
> queues for CX3 internal cache utilization,
> in your case the netdev doesn't care about xdp_tx rings, the
> accounting you added in this patch adds a  lot of
> complications and it would be better to have clear separation between
> the two types of tx_rings, in terms of the holding/managing data
> structure.
> 
> I suggest here to leave priv->tx_ring untouched. i.e, don't store the
> xdp_tx rings at the end of it, i.e  priv->tx_ring should only reflect
> the
> netdev real tx queues.
> 
> In case of priv->porg is active, we can allocate and store xdp tx ring
> per rx ring, this tx ring will be allocated and activated
> once the rx ring is created and activated, and store this dedicated tx
> ring  in the rx_ring it self.
> 
> i.e :
> struct mlx4_en_rx_ring {
> [...]
> struct mlx4_en_tx_ring *xdp_tx;
> struct mlx4_en_cq *xdp_tx_cq;
> [...]
> }
> 
> for this the following changes are required.
> 
> @ mlx4_en_create_rx_ring
> [...] // Create the RX ring
> 
> /* create CQ for xdp tx ring */
> node = cpu_to_node(i % num_online_cpus());
> 
> mlx4_en_create_cq(priv, &rx_ring->xdp_tx_cq, prof->tx_ring_size, i, TX, node)
> 
> /* create xdp tx ring */
> mlx4_en_create_tx_ring(priv, &rx_ring->xdp_tx, prof->tx_ring_size,
> TXBB_SIZE, node, -1)
> 
> @mlx4_en_start/stop_port
> /* Configure tx cq's and rings */
> // You will need to configure xdp tx rings same as priv->rx_ring_num rings
> 
> @mlx4_en_poll_tx_cq
> This Also will require a new NAPI handler for xdp rings to replace the
> following line @mlx4_en_poll_tx_cq
> - struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
> with
> + struct mlx4_en_tx_ring *ring = priv->rx_ring[cq->ring].xdp_tx;
> 
> Or just change cq->ring from ring index to the actual ring pointer.
> 
> Bottom line, my suggestion also started to look complicated :).. but
> still it would look cleaner to separate between netdev rings and xdp
> rings.
> 
I considered this at first too, but it seemed the worse option to me at
the time. There would be a lot of copy/paste as well as new code to
review.
> 
> >  static void mlx4_en_get_channels(struct net_device *dev,
> >  struct ethtool_channels *channel)
> >  {
> > @@ -1705,7 +1710,7 @@ static void mlx4_en_get_channels(struct net_device 
> > *dev,
> > channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP;
> >
> > channel->rx_count = priv->rx_ring_num;
> > -   channel->tx_count = priv->tx_ring_num / MLX4_EN_NUM_UP;
> > +   channel->tx_count = priv->num_tx_rings_p_up;
> >  }
> >
> >  static int mlx4_en_set_channels(struct net_device *dev,
> > @@ -1717,7 +1722,7 @@ static int mlx4_en_set_channels(struct net_device 
> > *dev,
> > int err = 0;
> >
> > if (channel->other_count || channel->combined_count ||
> > -   channel->tx_count > MLX4_EN_MAX_TX_RING_P_UP ||
> > +   channel->tx_count > mlx4_en_max_tx_channels(priv) ||
> > channel->rx_coun

Re: [PATCH 1/1] tracing, bpf: Implement function bpf_probe_write

2016-07-13 Thread Alexei Starovoitov
On Wed, Jul 13, 2016 at 03:36:11AM -0700, Sargun Dhillon wrote:
> Provides BPF programs attached to kprobes a safe way to write to
> memory referenced by probes. This is done by making probe_kernel_write
> accessible to bpf functions via the bpf_probe_write helper.

not quite :)

> Signed-off-by: Sargun Dhillon 
> ---
>  include/uapi/linux/bpf.h  |  3 +++
>  kernel/trace/bpf_trace.c  | 20 
>  samples/bpf/bpf_helpers.h |  2 ++
>  3 files changed, 25 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 406459b..355b565 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -313,6 +313,9 @@ enum bpf_func_id {
>   */
>   BPF_FUNC_skb_get_tunnel_opt,
>   BPF_FUNC_skb_set_tunnel_opt,
> +
> + BPF_FUNC_probe_write, /* int bpf_probe_write(void *dst, void *src,
> int size) */
> +

the patch is against some old kernel.
Please always make the patch against net-next tree and cc netdev list.

> +static u64 bpf_probe_write(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
> + void *dst = (void *) (long) r1;
> + void *unsafe_ptr = (void *) (long) r2;
> + int  size = (int) r3;
> +
> + return probe_kernel_write(dst, unsafe_ptr, size);
> +}

the patch is whitepsace mangled. Please see 
Documentation/networking/netdev-FAQ.txt

the main issue though that we cannot simply allow bpf to do probe_write,
since it may crash the kernel.
What might be ok is to allow writing into memory of current
user space process only. This way bpf prog will keep kernel safety guarantees,
yet it will be able to modify user process memory when necessary.
Since bpf+tracing is root only, it doesn't pose security risk.



Re: [patch] net/mlx5: missing error code in esw_create_offloads_fdb_table()

2016-07-13 Thread Matan Barak

On 13/07/2016 13:08, Dan Carpenter wrote:

We accidentally return success when we had intended to return an error
code.

Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads 
mode')
Signed-off-by: Dan Carpenter 

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 1842dfb..7d982cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -183,6 +183,7 @@ static int esw_create_offloads_fdb_table(struct 
mlx5_eswitch *esw, int nvports)

root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
if (!root_ns) {
+   err = -EINVAL;
esw_warn(dev, "Failed to get FDB flow namespace\n");
goto ns_err;
}



Hi,

Thanks for the patch.
I'm not sure EINVAL is the right error here though.
Maybe -ENOTSUPP is a bit more appropriate here.

Regards,
Matan


Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records

2016-07-13 Thread Daniel Borkmann

On 07/13/2016 06:40 PM, Peter Zijlstra wrote:

On Wed, Jul 13, 2016 at 04:08:55PM +0200, Daniel Borkmann wrote:

On 07/13/2016 03:42 PM, Peter Zijlstra wrote:


Ok so the nonlinear thing was it doing _two_ copies, one the regular
__output_copy() on raw->data and second the optional fragment thingy
using __output_custom().

Would something like this work instead?

It does the nonlinear thing and the custom copy function thing but
allows more than 2 fragments and allows each fragment to have a custom
copy.

It doesn't look obviously more expensive; it has the one ->copy branch
extra, but then it doesn't recompute the sizes.


Yes, that would work as well on a quick glance with diff just a bit
bigger, but more generic this way. Do you want me to adapt this into
the first patch?


Please.


One question below:




-   u64 zero = 0;



-   if (real_size - raw_size)
-   __output_copy(handle, &zero, real_size - 
raw_size);



We still need the zero padding here from above with the computed
raw->size, right?


Ah, yes, we need some __output*() in order to advance the handle offset.
We don't _need_ to copy the 0s, but I doubt __output_skip() is much
cheaper for these 1-3 bytes worth of data; we've already touched that
line anyway.


Okay, thanks for your input! I'll respin then.


[PATCH] bonding: set carrier off for devices created through netlink

2016-07-13 Thread Beniamino Galvani
Commit e826eafa65c6 ("bonding: Call netif_carrier_off after
register_netdevice") moved netif_carrier_off() from bond_init() to
bond_create(), but the latter is called only for initial default
devices and ones created through sysfs:

 $ modprobe bonding
 $ echo +bond1 > /sys/class/net/bonding_masters
 $ ip link add bond2 type bond
 $ grep "MII Status" /proc/net/bonding/*
 /proc/net/bonding/bond0:MII Status: down
 /proc/net/bonding/bond1:MII Status: down
 /proc/net/bonding/bond2:MII Status: up

Ensure that carrier is initially off also for devices created through
netlink.

Signed-off-by: Beniamino Galvani 
---
 drivers/net/bonding/bond_netlink.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_netlink.c 
b/drivers/net/bonding/bond_netlink.c
index db760e8..b8df0f5 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -446,7 +446,11 @@ static int bond_newlink(struct net *src_net, struct 
net_device *bond_dev,
if (err < 0)
return err;
 
-   return register_netdevice(bond_dev);
+   err = register_netdevice(bond_dev);
+
+   netif_carrier_off(bond_dev);
+
+   return err;
 }
 
 static size_t bond_get_size(const struct net_device *bond_dev)
-- 
2.5.5



[PATCH] rndis_host: Set random MAC for ZTE MF910

2016-07-13 Thread Kristian Evensen
From: Kristian Evensen 

All ZTE MF910 mifis, at least on some revisions, export the same MAC
address (36:4b:50:b7:ef:da). Check for this MAC address and set a random
MAC if detected.

Also, changed the memcpy() to ether_addr_copy(), as pointed out by
checkpatch.

Signed-off-by: Kristian Evensen 
---
 drivers/net/usb/rndis_host.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c
index 524a47a281..85bdbdf 100644
--- a/drivers/net/usb/rndis_host.c
+++ b/drivers/net/usb/rndis_host.c
@@ -295,6 +295,9 @@ static const struct net_device_ops rndis_netdev_ops = {
.ndo_validate_addr  = eth_validate_addr,
 };
 
+/* well-known buggy ZTE MF910 MAC address */
+static const u8 buggy_zte_addr[ETH_ALEN] = {0x36, 0x4b, 0x50, 0xb7, 0xef, 
0xda};
+
 int
 generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags)
 {
@@ -428,7 +431,11 @@ generic_rndis_bind(struct usbnet *dev, struct 
usb_interface *intf, int flags)
dev_err(&intf->dev, "rndis get ethaddr, %d\n", retval);
goto halt_fail_and_release;
}
-   memcpy(net->dev_addr, bp, ETH_ALEN);
+
+   if (ether_addr_equal(bp, buggy_zte_addr))
+   eth_hw_addr_random(net);
+   else
+   ether_addr_copy(net->dev_addr, bp);
 
/* set a nonzero filter to enable data transfers */
memset(u.set, 0, sizeof *u.set);
-- 
2.5.0



Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records

2016-07-13 Thread Peter Zijlstra
On Wed, Jul 13, 2016 at 04:08:55PM +0200, Daniel Borkmann wrote:
> Hi Peter,
> 
> On 07/13/2016 03:42 PM, Peter Zijlstra wrote:
> >
> >Ok so the nonlinear thing was it doing _two_ copies, one the regular
> >__output_copy() on raw->data and second the optional fragment thingy
> >using __output_custom().
> >
> >Would something like this work instead?
> >
> >It does the nonlinear thing and the custom copy function thing but
> >allows more than 2 fragments and allows each fragment to have a custom
> >copy.
> >
> >It doesn't look obviously more expensive; it has the one ->copy branch
> >extra, but then it doesn't recompute the sizes.
> 
> Yes, that would work as well on a quick glance with diff just a bit
> bigger, but more generic this way. Do you want me to adapt this into
> the first patch?

Please.

> One question below:
> 

> >-u64 zero = 0;

> >-if (real_size - raw_size)
> >-__output_copy(handle, &zero, real_size - 
> >raw_size);

> 
> We still need the zero padding here from above with the computed
> raw->size, right?

Ah, yes, we need some __output*() in order to advance the handle offset.
We don't _need_ to copy the 0s, but I doubt __output_skip() is much
cheaper for these 1-3 bytes worth of data; we've already touched that
line anyway.


Re: [PATCH v3] Marvell phy: add fiber status check and configuration for some phys

2016-07-13 Thread Andrew Lunn
>  +static int marvell_resume_fiber(struct phy_device *phydev)
>  +{
>  +int err;
>  +
>  +/* Resume the fiber mode first */
>  +err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_FIBER);
>  +if (err < 0)
>  +goto error;
>  +
>  +err = genphy_resume(phydev);
>  +if (err < 0)
>  +goto error;
>  +
>  +/* Then, the copper link */
>  +err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_COPPER);
>  +if (err < 0)
>  +goto error;
>  +
>  +return genphy_resume(phydev);
> >>>
> >>> Should it be resumed twice? Or just once at the end?  Same question
> >>> for suspend.
> >>
> >> I don't understand your question.
> > 
> > You call genphy_resume(phydev) twice. Once is sufficient.
> 
> Yes, but it's normal because each interface could be suspended or resumed 
> independently.

> genphy_* functions use BMCR register which are identical between
> fiber and copper link. But each link has its own register to change.

Ah! Now i get it. I think you need a comment here. Something like:

/* With the page set, use the generic resume */

What i was worried about is that there is some reference counting
going on inside these functions. And so suspending the same phydev
multiple times will mess up the reference counts. But no, it just
twiddles a register bit, so that is O.K.

 Andrew


Re: [PATCH v8 06/11] net/mlx4_en: add page recycle to prepare rx ring for tx support

2016-07-13 Thread Brenden Blanco
On Wed, Jul 13, 2016 at 10:17:26AM +0300, Tariq Toukan wrote:
> 
> On 13/07/2016 3:54 AM, Brenden Blanco wrote:
> >On Tue, Jul 12, 2016 at 02:18:32PM -0700, David Miller wrote:
> >>From: Brenden Blanco 
> >>Date: Tue, 12 Jul 2016 00:51:29 -0700
> >>
> >>>+  mlx4_en_free_resources(priv);
> >>>+
> >>>   old_prog = xchg(&priv->prog, prog);
> >>>   if (old_prog)
> >>>   bpf_prog_put(old_prog);
> >>>-  return 0;
> >>>+  err = mlx4_en_alloc_resources(priv);
> >>>+  if (err) {
> >>>+  en_err(priv, "Failed reallocating port resources\n");
> >>>+  goto out;
> >>>+  }
> >>>+  if (port_up) {
> >>>+  err = mlx4_en_start_port(dev);
> >>>+  if (err)
> >>>+  en_err(priv, "Failed starting port\n");
> >>A failed configuration operation should _NEVER_ leave the interface in
> >>an inoperative state like these error paths do.
> >>
> >>You must instead preallocate the necessary resources, and only change
> >>the chip's configuration and commit to the new settings once you have
> >>successfully allocated those resources.
> >I'll see what I can do here.
> That's exactly what we're doing in a patchset that will be submitted
> to net very soon (this week).
Thanks Tariq!
As an example, I had originally tried to integrate this code into
mlx4_en_set_channels, which seems to have the same problem.
> It fixes/refactors these failure flows just like Dave described,
> something like:
> 
> err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof);
> if (err)
> goto out;
> 
> if (priv->port_up) {
> port_up = 1;
> mlx4_en_stop_port(dev, 1);
> }
> 
> mlx4_en_safe_replace_resources(priv, tmp);
> 
> if (port_up) {
> err = mlx4_en_start_port(dev);
> if (err)
> en_err(priv, "Failed starting port\n");
> }
> 
> I suggest you keep your code aligned with current net-next driver,
> and later I will take it and fix it (once merged with net).
Another option is to avoid entirely the tx_ring_num change, so as to
keep the majority of the initialized state valid. We would only allocate
a new set of pages and refill the rx rings once we have confirmed there
are enough resources.

So others can follow the discussion, there are multiple reasons to
reconfigure the rings.
1. The rx frags should be page-per-packet
2. The pages should be mapped DMA_BIDIRECTIONAL
3. Each rx ring should have a dedicated tx ring, which is off limits
from the upper stack
4. The dedicated tx ring will have a pointer back to its rx ring for
recycling

#1 and #2 can be done to the side ahead of time, as you are also
suggesting.

Currently, to achieve #3, we increase tx_ring_num while keeping
num_tx_rings_p_up the same. This precipitates a round of
free/alloc_resources, which takes some time and has many opportunities
for failure.
However, we could resurrect an earlier approach that keeps the
tx_ring_num unchanged, and instead just do a
netif_set_real_num_tx_queues(tx_ring_num - rsv_tx_rings) to hide it from
the stack. This would require that there be enough rings ahead of time,
with a simple bounds check like:
if (tx_ring_num < rsv_tx_rings + MLX4_EN_MAX_TX_RING_P_UP) {
en_err(priv, "XDP requires minimum %d + %d rings\n", rsv_tx_rings,
MLX4_EN_MAX_TX_RING_P_UP);
return -EINVAL;
}
The default values for tx_ring_num and rx_ring_num will only hit this
case when operating in a low memory environment, in which case the user
must increase the number of channels manually. I think that is a fair
tradeoff.

The rest of #1, #2, and #4 can be done in a guaranteed fashion once the
buffers are allocated, since it would just be a few loops to refresh the
rx_desc and recycle_ring.
> 
> Regards,
> Tariq


Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()

2016-07-13 Thread Matan Barak

On 13/07/2016 14:48, Dan Carpenter wrote:

We accidentally return success when we had intended to return an error
code.

Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads 
mode')
Signed-off-by: Dan Carpenter 
---
v2: return -ENOTSUPP instead of -EINVAL

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c 
b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 1842dfb..7d982cf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -183,6 +183,7 @@ static int esw_create_offloads_fdb_table(struct 
mlx5_eswitch *esw, int nvports)

root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB);
if (!root_ns) {
+   err = -ENOTSUPP;
esw_warn(dev, "Failed to get FDB flow namespace\n");
goto ns_err;
}



Thanks.

Reviewed-by: Matan Barak 


Re: [PATCH v7 09/11] net/mlx4_en: add xdp forwarding and data write support

2016-07-13 Thread Saeed Mahameed
On Tue, Jul 12, 2016 at 12:29 AM, Brenden Blanco  wrote:
> A user will now be able to loop packets back out of the same port using
> a bpf program attached to xdp hook. Updates to the packet contents from
> the bpf program is also supported.
>
> For the packet write feature to work, the rx buffers are now mapped as
> bidirectional when the page is allocated. This occurs only when the xdp
> hook is active.
>
> When the program returns a TX action, enqueue the packet directly to a
> dedicated tx ring, so as to avoid completely any locking. This requires
> the tx ring to be allocated 1:1 for each rx ring, as well as the tx
> completion running in the same softirq.
>
> Upon tx completion, this dedicated tx ring recycles pages without
> unmapping directly back to the original rx ring. In steady state tx/drop
> workload, effectively 0 page allocs/frees will occur.
>
> Signed-off-by: Brenden Blanco 
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_ethtool.c |  15 ++-
>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c  |  19 +++-
>  drivers/net/ethernet/mellanox/mlx4/en_rx.c  |  14 +++
>  drivers/net/ethernet/mellanox/mlx4/en_tx.c  | 126 
> +++-
>  drivers/net/ethernet/mellanox/mlx4/mlx4_en.h|  14 ++-
>  5 files changed, 181 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c 
> b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> index d3d51fa..10642b1 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
> @@ -1694,6 +1694,11 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, 
> struct ethtool_rxnfc *cmd)
> return err;
>  }
>
> +static int mlx4_en_max_tx_channels(struct mlx4_en_priv *priv)
> +{
> +   return (MAX_TX_RINGS - priv->rsv_tx_rings) / MLX4_EN_NUM_UP;
> +}
> +

MAX_TX_RINGS is a software limitation made to limit netdev real_num_tx
queues for CX3 internal cache utilization,
in your case the netdev doesn't care about xdp_tx rings, the
accounting you added in this patch adds a  lot of
complications and it would be better to have clear separation between
the two types of tx_rings, in terms of the holding/managing data
structure.

I suggest here to leave priv->tx_ring untouched. i.e, don't store the
xdp_tx rings at the end of it, i.e  priv->tx_ring should only reflect
the
netdev real tx queues.

In case of priv->porg is active, we can allocate and store xdp tx ring
per rx ring, this tx ring will be allocated and activated
once the rx ring is created and activated, and store this dedicated tx
ring  in the rx_ring it self.

i.e :
struct mlx4_en_rx_ring {
[...]
struct mlx4_en_tx_ring *xdp_tx;
struct mlx4_en_cq *xdp_tx_cq;
[...]
}

for this the following changes are required.

@ mlx4_en_create_rx_ring
[...] // Create the RX ring

/* create CQ for xdp tx ring */
node = cpu_to_node(i % num_online_cpus());

mlx4_en_create_cq(priv, &rx_ring->xdp_tx_cq, prof->tx_ring_size, i, TX, node)

/* create xdp tx ring */
mlx4_en_create_tx_ring(priv, &rx_ring->xdp_tx, prof->tx_ring_size,
TXBB_SIZE, node, -1)

@mlx4_en_start/stop_port
/* Configure tx cq's and rings */
// You will need to configure xdp tx rings same as priv->rx_ring_num rings

@mlx4_en_poll_tx_cq
This Also will require a new NAPI handler for xdp rings to replace the
following line @mlx4_en_poll_tx_cq
- struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring];
with
+ struct mlx4_en_tx_ring *ring = priv->rx_ring[cq->ring].xdp_tx;

Or just change cq->ring from ring index to the actual ring pointer.

Bottom line, my suggestion also started to look complicated :).. but
still it would look cleaner to separate between netdev rings and xdp
rings.


>  static void mlx4_en_get_channels(struct net_device *dev,
>  struct ethtool_channels *channel)
>  {
> @@ -1705,7 +1710,7 @@ static void mlx4_en_get_channels(struct net_device *dev,
> channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP;
>
> channel->rx_count = priv->rx_ring_num;
> -   channel->tx_count = priv->tx_ring_num / MLX4_EN_NUM_UP;
> +   channel->tx_count = priv->num_tx_rings_p_up;
>  }
>
>  static int mlx4_en_set_channels(struct net_device *dev,
> @@ -1717,7 +1722,7 @@ static int mlx4_en_set_channels(struct net_device *dev,
> int err = 0;
>
> if (channel->other_count || channel->combined_count ||
> -   channel->tx_count > MLX4_EN_MAX_TX_RING_P_UP ||
> +   channel->tx_count > mlx4_en_max_tx_channels(priv) ||
> channel->rx_count > MAX_RX_RINGS ||
> !channel->tx_count || !channel->rx_count)
> return -EINVAL;
> @@ -1731,7 +1736,8 @@ static int mlx4_en_set_channels(struct net_device *dev,
> mlx4_en_free_resources(priv);
>
> priv->num_tx_rings_p_up = channel->tx_count;
> -   priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP;
> +   priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP +
> + 

Re: linux-next: Tree for Jul 13 (net/core/devlink with Tracing)

2016-07-13 Thread Randy Dunlap
On 07/12/16 23:47, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20160712:
> 

on x86_64:
(full randconfig file is attached)


  CC  net/core/devlink.o
In file included from ../include/trace/define_trace.h:95:0,
 from ../include/trace/events/devlink.h:51,
 from ../net/core/devlink.c:30:
../include/trace/events/devlink.h: In function 
'trace_event_get_offsets_devlink_hwmsg':
../include/trace/events/devlink.h:25:51: error: dereferencing pointer to 
incomplete type
   __string(owner_name, devlink->dev->driver->owner->name)
   ^
../include/trace/trace_events.h:501:2: note: in definition of macro 
'DECLARE_EVENT_CLASS'
  tstruct;   \
  ^
../include/trace/trace_events.h:63:9: note: in expansion of macro 'PARAMS'
 PARAMS(tstruct), \
 ^
../include/trace/events/devlink.h:16:1: note: in expansion of macro 
'TRACE_EVENT'
 TRACE_EVENT(devlink_hwmsg,
 ^
../include/trace/events/devlink.h:22:2: note: in expansion of macro 
'TP_STRUCT__entry'
  TP_STRUCT__entry(
  ^
../include/trace/trace_events.h:466:29: note: in expansion of macro 
'__dynamic_array'
 #define __string(item, src) __dynamic_array(char, item,   \
 ^
../include/trace/events/devlink.h:25:3: note: in expansion of macro '__string'
   __string(owner_name, devlink->dev->driver->owner->name)
   ^
../include/trace/events/devlink.h:25:51: error: dereferencing pointer to 
incomplete type
   __string(owner_name, devlink->dev->driver->owner->name)
   ^
../include/trace/trace_events.h:501:2: note: in definition of macro 
'DECLARE_EVENT_CLASS'
  tstruct;   \
  ^
../include/trace/trace_events.h:63:9: note: in expansion of macro 'PARAMS'
 PARAMS(tstruct), \
 ^
../include/trace/events/devlink.h:16:1: note: in expansion of macro 
'TRACE_EVENT'
 TRACE_EVENT(devlink_hwmsg,
 ^
../include/trace/events/devlink.h:22:2: note: in expansion of macro 
'TP_STRUCT__entry'
  TP_STRUCT__entry(
  ^
../include/trace/trace_events.h:466:29: note: in expansion of macro 
'__dynamic_array'
 #define __string(item, src) __dynamic_array(char, item,   \
 ^
../include/trace/events/devlink.h:25:3: note: in expansion of macro '__string'
   __string(owner_name, devlink->dev->driver->owner->name)
   ^
In file included from ../include/trace/define_trace.h:95:0,
 from ../include/trace/events/devlink.h:51,
 from ../net/core/devlink.c:30:
../include/trace/events/devlink.h: In function 
'trace_event_raw_event_devlink_hwmsg':
../include/trace/events/devlink.h:35:55: error: dereferencing pointer to 
incomplete type
   __assign_str(owner_name, devlink->dev->driver->owner->name);
   ^
../include/trace/trace_events.h:686:4: note: in definition of macro 
'DECLARE_EVENT_CLASS'
  { assign; }   \
^
../include/trace/trace_events.h:64:9: note: in expansion of macro 'PARAMS'
 PARAMS(assign), \
 ^
../include/trace/events/devlink.h:16:1: note: in expansion of macro 
'TRACE_EVENT'
 TRACE_EVENT(devlink_hwmsg,
 ^
../include/trace/events/devlink.h:32:2: note: in expansion of macro 
'TP_fast_assign'
  TP_fast_assign(
  ^
../include/trace/events/devlink.h:35:3: note: in expansion of macro 
'__assign_str'
   __assign_str(owner_name, devlink->dev->driver->owner->name);
   ^
../include/trace/events/devlink.h:35:55: error: dereferencing pointer to 
incomplete type
   __assign_str(owner_name, devlink->dev->driver->owner->name);
   ^
../include/trace/trace_events.h:686:4: note: in definition of macro 
'DECLARE_EVENT_CLASS'
  { assign; }   \
^
../include/trace/trace_events.h:64:9: note: in expansion of macro 'PARAMS'
 PARAMS(assign), \
 ^
../include/trace/events/devlink.h:16:1: note: in expansion of macro 
'TRACE_EVENT'
 TRACE_EVENT(devlink_hwmsg,
 ^
../include/trace/events/devlink.h:32:2: note: in expansion of macro 
'TP_fast_assign'
  TP_fast_assign(
  ^
../include/trace/events/devlink.h:35:3: note: in expansion of macro 
'__assign_str'
   __assign_str(owner_name, devlink->dev->driver->owner->name);
   ^
In file included from ../include/trace/define_trace.h:96:0,
 from ../include/trace/events/devlink.h:51,
 from ../net/core/devlink.c:30:
../include/trace/events/devlink.h: In function 'perf_trace_devlink_hwmsg':
../include/trace/events/devlink.h:35:55: error: dereferencing pointer to 
incomplete type
   __assign_str(owner_name, devlink->dev->driver->owner->name);
   ^
../include/trace/perf.h:65:4: note: in definition of macro 'DECLARE_EVENT_CLASS'
  { assign; }   \
^
../include/trace/trace_events.h:64:9: note: in expansion of macro 'PARAMS'
 PARAMS(assign), \
 ^
../include/trace

Re: [PATCH v8 00/11] Add driver bpf hook for early packet drop and forwarding

2016-07-13 Thread Tariq Toukan


On 12/07/2016 5:38 PM, Tariq Toukan wrote:
Regression tests for mlx4_en are currently running, results will be 
ready by tomorrow morning.

Functional regression results look fine.


Regards,
Tariq




Re: [PATCH v8 04/11] net/mlx4_en: add support for fast rx drop bpf program

2016-07-13 Thread Brenden Blanco
On Wed, Jul 13, 2016 at 11:27:23AM +, David Laight wrote:
> From: Brenden Blanco
> > Sent: 12 July 2016 08:51
> > Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver.
> > 
> > In tc/socket bpf programs, helpers linearize skb fragments as needed
> > when the program touches the packet data. However, in the pursuit of
> > speed, XDP programs will not be allowed to use these slower functions,
> > especially if it involves allocating an skb.
> > 
> > Therefore, disallow MTU settings that would produce a multi-fragment
> > packet that XDP programs would fail to access. Future enhancements could
> > be done to increase the allowable MTU.
> 
> Maybe I'm misunderstanding what is going on here...
> But what has the MTU to do with how skb are fragmented?
This is mlx4 specific...depending on the MTU the driver will write data
into 1536, 1536+4096, 1536+4096+4096, etc. fragments.
> 
> If the skb come from a reasonably written USB ethernet interface they could
> easily have arbitrary fragment boundaries (the frames get packed into USB
> buffers).
The XDP program is operating directly on the packet memory, before any
skb has been allocated. The program also expects a continguous memory
region to inspect...it's too expensive to linearize the data like we do
in the tc hook case, that's a feature that costs too much for this type
of low level feature. Therefore, XDP can only be turned on in
combination with a cooperative driver, that's the performance tradeoff
we're imposing here.
> 
> Outbound skb can also have fragments depending on how they are generated.
Sure, but XDP won't run on those. This is an rx-only feature.
> 
>   David


Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records

2016-07-13 Thread Daniel Borkmann

Hi Peter,

On 07/13/2016 03:42 PM, Peter Zijlstra wrote:


Ok so the nonlinear thing was it doing _two_ copies, one the regular
__output_copy() on raw->data and second the optional fragment thingy
using __output_custom().

Would something like this work instead?

It does the nonlinear thing and the custom copy function thing but
allows more than 2 fragments and allows each fragment to have a custom
copy.

It doesn't look obviously more expensive; it has the one ->copy branch
extra, but then it doesn't recompute the sizes.


Yes, that would work as well on a quick glance with diff just a bit
bigger, but more generic this way. Do you want me to adapt this into
the first patch?

One question below:


diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1fe22032f228..83e2a83e8db3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -69,9 +69,18 @@ struct perf_callchain_entry_ctx {
boolcontexts_maxed;
  };

+typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long 
len);
+
+struct perf_raw_frag {
+   struct perf_raw_frag*next;
+   perf_copy_f copy;
+   void*data;
+   u32 size;
+} __packed;
+
  struct perf_raw_record {
+   struct perf_raw_fragfrag;
u32 size;
-   void*data;
  };

  /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fe8d49a56322..f7ad7d65317d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5617,16 +5617,21 @@ void perf_output_sample(struct perf_output_handle 
*handle,
}

if (sample_type & PERF_SAMPLE_RAW) {
-   if (data->raw) {
-   u32 raw_size = data->raw->size;
-   u32 real_size = round_up(raw_size + sizeof(u32),
-sizeof(u64)) - sizeof(u32);
-   u64 zero = 0;
-
-   perf_output_put(handle, real_size);
-   __output_copy(handle, data->raw->data, raw_size);
-   if (real_size - raw_size)
-   __output_copy(handle, &zero, real_size - 
raw_size);
+   struct perf_raw_record *raw = data->raw;
+
+   if (raw) {
+   struct perf_raw_frag *frag = &raw->frag;
+
+   perf_output_put(handle, raw->size);
+   do {
+   if (frag->copy) {
+   __output_custom(handle, frag->copy,
+   frag->data, frag->size);
+   } else {
+   __output_copy(handle, frag->data, 
frag->size);
+   }
+   frag = frag->next;
+   } while (frag);


We still need the zero padding here from above with the computed
raw->size, right?


} else {
struct {
u32 size;
@@ -5751,14 +5756,22 @@ void perf_prepare_sample(struct perf_event_header 
*header,


Thanks,
Daniel


Re: [PATCH] net: ip_finish_output_gso: If skb_gso_network_seglen exceeds MTU, do segmentation even for non IPSKB_FORWARDED skbs

2016-07-13 Thread Shmulik Ladkani
Hi Florian, Hannes,

On Tue, 12 Jul 2016 08:56:56 +0300 Shmulik Ladkani 
 wrote:
> On Sat, 9 Jul 2016 15:22:30 +0200 Florian Westphal  wrote:
> > > 
> > > > What about setting IPCB FORWARD flag in iptunnel_xmit if
> > > > skb->skb_iif != 0... instead?
> 
> I've came up with a suggestion that does not abuse IPSKB_FORWARDED,
> while properly addressing the use case (and similar ones), without
> introducing the cost of entering 'skb_gso_validate_mtu' in the local
> case.
> 
> How about:
> 
> @@ -220,12 +220,15 @@ static int ip_finish_output_gso(struct net *net, struct 
> sock *sk,
>   struct sk_buff *skb, unsigned int mtu)
>  {
>   netdev_features_t features;
> + int local_trusted_gso;
>   struct sk_buff *segs;
>   int ret = 0;
>  
> - /* common case: locally created skb or seglen is <= mtu */
> - if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
> -   skb_gso_validate_mtu(skb, mtu))
> + local_trusted_gso = (IPCB(skb)->flags & IPSKB_FORWARDED) == 0 &&
> + !(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY);
> + /* common case: locally created skb from a trusted gso source or
> +  * seglen is <= mtu */
> + if (local_trusted_gso || skb_gso_validate_mtu(skb, mtu))
>   return ip_finish_output2(net, sk, skb);
>  
>   /* Slowpath -  GSO segment length is exceeding the dst MTU.
> 
> This well addresses the usecase where we have gso-skb arriving from an
> untrusted source, thus its gso_size is out of our control (e.g. tun/tap,
> macvtap, af_packet, xen-netfront...).
> 
> Locally "gso trusted" skbs (the common case) will NOT suffer the
> additional (possibly costy) call to 'skb_gso_validate_mtu'.
> 
> Also, if IPSKB_FORWARDED is true, behavior stays exactly the same.

Any comments regarding the latest suggestion above?
I'd like to post it as v2 - if it is in the right direction.

It handles the problem of gso_size values which are not in host's
control, it addresses the usecase described, and has a benefit of not
overloading IPSKB_FORWARDED with a new semantic that might be hard to
maintain.

PS:
Also, if we'd like to pinpoint it even further, we can:

local_trusted_gso = (IPCB(skb)->flags & IPSKB_FORWARDED) == 0 &&
(!sk || !(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY));

Which ensures only the following conditions go to the expensive
skb_gso_validate_mtu:

1. IPSKB_FORWARDED is on
2. IPSKB_FORWARDED is off, but sk exists and gso_size is untrusted.
   Meaning: we have a packet arriving from higher layers (sk is set)
   with a gso_size out of host's control.

This fine-tuining leaves standard l2 bridging case (e.g 2x taps bridged)
of a gso skb unaffected, as sk would be NULL.

Many thanks,
Shmulik


[PATCH 0/2] Code style fixes

2016-07-13 Thread Elad Kanfi
From: Elad Kanfi 

Fix all checkpatch warnings and errors, and reuse code

Elad Kanfi (2):
  net: nps_enet: fix coding style issues
  net: nps_enet: code reuse

 drivers/net/ethernet/ezchip/nps_enet.c |   27 ++-
 1 files changed, 14 insertions(+), 13 deletions(-)



Re: [RFC PATCH v3] net: sched: convert qdisc linked list to hashtable

2016-07-13 Thread Jiri Kosina
On Tue, 12 Jul 2016, Cong Wang wrote:

> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index f45929c..0b5c172e 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -52,6 +52,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  struct netpoll_info;
> >  struct device;
> > @@ -1778,6 +1779,7 @@ struct net_device {
> > unsigned intnum_tx_queues;
> > unsigned intreal_num_tx_queues;
> > struct Qdisc*qdisc;
> > +   DECLARE_HASHTABLE   (qdisc_hash, 4);
> > unsigned long   tx_queue_len;
> > spinlock_t  tx_global_lock;
> > int watchdog_timeo;
> 
> Should it be surrounded by CONFIG_NET_SCHED?
> To save several bytes for !CONFIG_NET_SCHED case.

Makes sense. I'll wait a bit for more feedback (if there is any) before 
including this in potential v4.

Thanks,

-- 
Jiri Kosina
SUSE Labs



Re: [PATCH v3] Marvell phy: add fiber status check and configuration for some phys

2016-07-13 Thread Charles-Antoine Couret
Hi Andrew,

Le 13/07/2016 à 15:26, Andrew Lunn a écrit :
   *
   * Generic status code does not detect Fiber correctly!
 @@ -906,12 +1070,17 @@ static int marvell_read_status(struct phy_device 
 *phydev)
int lpa;
int lpagb;
int status = 0;
 +  int page, fiber;
  
 -  /* Update the link, but return if there
 +  /* Detect and update the link, but return if there
 * was an error */
 -  err = genphy_update_link(phydev);
 -  if (err)
 -  return err;
 +  page = phy_read(phydev, MII_MARVELL_PHY_PAGE);
 +  if (page == MII_M_FIBER)
 +  fiber = 1;
 +  else
 +  fiber = 0;
>>>
>>> This read is expensive, since the MDIO bus is slow. It would be better
>>> just to pass fibre as a parameter.
>>
>> But this function is used for other Marvell's phy, without fiber link for 
>> example.
>> And this function should has only the struct phy_device as parameter.
>>
>> I don't have idea to avoid that, without create a custom function for that 
>> which would be very similar to this function.
>> Or used a phy_device field for that? I think it's awful idea...
> 
> So i would have
> 
> static int marvell_read_status_page(struct phy_device *phydev, int page)
> {}
> 
> basically doing what you have above, but without the read.
> 
> static int marvell_read_status(struct phy_device *phydev)
> {
>   if (phydev->supported & SUPPORTED_FIBRE) {
>   marvell_read_status_page(phydev, MII_M_FIBER);
>   if (phydev->link)
>   return;
> 
>   return marvell_read_status_page(phydev, MII_M_COPPER);
> }

Oh I see. Thank you!

> 
>>> I think it would be better to look for SUPPORTED_FIBRE in
>>> drv->features, rather than have two different functions.
>>>
>>> In fact, i would do that in general, rather than add your _fibre()
>>> functions.
>>
>> So, you suggest to do that in genphy_* functions or create marvell_* 
>> functions with this condition?
>> I'm agree with the second suggestion.
> 
> The second.

I'm working on this.
It's done for _resume and _suspend. It will be done for _status.

But, as far as aneg or ethtool are concerned, I think adding these functions is better.

 +
 +/* marvell_resume_fiber
 + *
 + * Some Marvell's phys have two modes: fiber and copper.
 + * Both need to be resumed
 + */
 +static int marvell_resume_fiber(struct phy_device *phydev)
 +{
 +  int err;
 +
 +  /* Resume the fiber mode first */
 +  err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_FIBER);
 +  if (err < 0)
 +  goto error;
 +
 +  err = genphy_resume(phydev);
 +  if (err < 0)
 +  goto error;
 +
 +  /* Then, the copper link */
 +  err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_COPPER);
 +  if (err < 0)
 +  goto error;
 +
 +  return genphy_resume(phydev);
>>>
>>> Should it be resumed twice? Or just once at the end?  Same question
>>> for suspend.
>>
>> I don't understand your question.
> 
> You call genphy_resume(phydev) twice. Once is sufficient.

Yes, but it's normal because each interface could be suspended or resumed 
independently.
genphy_* functions use the BMCR register, which is identical between the fiber and
copper links. But each link has its own register to change.

Thank you.
Regards.

Charles-Antoine Couret


Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records

2016-07-13 Thread Peter Zijlstra

Ok so the nonlinear thing was it doing _two_ copies, one the regular
__output_copy() on raw->data and second the optional fragment thingy
using __output_custom().

Would something like this work instead?

It does the nonlinear thing and the custom copy function thing but
allows more than 2 fragments and allows each fragment to have a custom
copy.

It doesn't look obviously more expensive; it has the one ->copy branch
extra, but then it doesn't recompute the sizes.

---

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 1fe22032f228..83e2a83e8db3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -69,9 +69,18 @@ struct perf_callchain_entry_ctx {
boolcontexts_maxed;
 };
 
+typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long 
len);
+
+struct perf_raw_frag {
+   struct perf_raw_frag*next;
+   perf_copy_f copy;
+   void*data;
+   u32 size;
+} __packed;
+
 struct perf_raw_record {
+   struct perf_raw_fragfrag;
u32 size;
-   void*data;
 };
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fe8d49a56322..f7ad7d65317d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5617,16 +5617,21 @@ void perf_output_sample(struct perf_output_handle 
*handle,
}
 
if (sample_type & PERF_SAMPLE_RAW) {
-   if (data->raw) {
-   u32 raw_size = data->raw->size;
-   u32 real_size = round_up(raw_size + sizeof(u32),
-sizeof(u64)) - sizeof(u32);
-   u64 zero = 0;
-
-   perf_output_put(handle, real_size);
-   __output_copy(handle, data->raw->data, raw_size);
-   if (real_size - raw_size)
-   __output_copy(handle, &zero, real_size - 
raw_size);
+   struct perf_raw_record *raw = data->raw;
+
+   if (raw) {
+   struct perf_raw_frag *frag = &raw->frag;
+
+   perf_output_put(handle, raw->size);
+   do {
+   if (frag->copy) {
+   __output_custom(handle, frag->copy,
+   frag->data, frag->size);
+   } else {
+   __output_copy(handle, frag->data, 
frag->size);
+   }
+   frag = frag->next;
+   } while (frag);
} else {
struct {
u32 size;
@@ -5751,14 +5756,22 @@ void perf_prepare_sample(struct perf_event_header 
*header,
}
 
if (sample_type & PERF_SAMPLE_RAW) {
-   int size = sizeof(u32);
+   struct perf_raw_record *raw = data->raw;
+   int size = sizeof(u64);
 
-   if (data->raw)
-   size += data->raw->size;
-   else
-   size += sizeof(u32);
+   if (raw) {
+   struct perf_raw_frag *frag = &raw->frag;
 
-   header->size += round_up(size, sizeof(u64));
+   size = sizeof(u32);
+   do {
+   size += frag->size;
+   frag = frag->next;
+   } while (frag);
+   size = round_up(size, sizeof(u64));
+   raw->size = size;
+   }
+
+   header->size += size;
}
 
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {



Re: [PATCH v3] Marvell phy: add fiber status check and configuration for some phys

2016-07-13 Thread Andrew Lunn
On Wed, Jul 13, 2016 at 11:14:21AM +0200, Charles-Antoine Couret wrote:


Hi Charles-Antoine
 
> >> +#define LPA_FIBER_1000HALF0x40
> >> +#define LPA_FIBER_1000FULL0x20
> >> +
> >> +#define LPA_PAUSE_FIBER   0x180
> >> +#define LPA_PAUSE_ASYM_FIBER  0x100
> >> +
> >> +#define ADVERTISE_FIBER_1000HALF  0x40
> >> +#define ADVERTISE_FIBER_1000FULL  0x20
> >> +
> >> +#define ADVERTISE_PAUSE_FIBER 0x180
> >> +#define ADVERTISE_PAUSE_ASYM_FIBER0x100
> > 
> > Are these standardised anywhere? If they are following a standard,
> > they should be put into include/uapi/linux/mii.h.

> I don't find any standard about this, I think it should be Marvell specific.

O.K.

> >> +static inline u32 ethtool_adv_to_fiber_adv_t(u32 ethadv)
> >> +{
> >> +  u32 result = 0;
> >> +
> >> +  if (ethadv & ADVERTISED_1000baseT_Half)
> >> +  result |= ADVERTISE_FIBER_1000HALF;
> > 
> > Dumb question: Does 1000baseT_Half even make sense for fibre? Can you
> > do half duplex?  Would that not mean you have a single fibre, both
> > ends are using the same laser frequency, and you are doing some form
> > of CSMA/CD?
> 
> It's strange, I agree, but the register about that exists in the datasheet 
> and the value is not fixed.
> In practice, I don't have a component to test this case correctly.

O.K, just implement it according to the data sheet.
 
> >>   *
> >>   * Generic status code does not detect Fiber correctly!
> >> @@ -906,12 +1070,17 @@ static int marvell_read_status(struct phy_device 
> >> *phydev)
> >>int lpa;
> >>int lpagb;
> >>int status = 0;
> >> +  int page, fiber;
> >>  
> >> -  /* Update the link, but return if there
> >> +  /* Detect and update the link, but return if there
> >> * was an error */
> >> -  err = genphy_update_link(phydev);
> >> -  if (err)
> >> -  return err;
> >> +  page = phy_read(phydev, MII_MARVELL_PHY_PAGE);
> >> +  if (page == MII_M_FIBER)
> >> +  fiber = 1;
> >> +  else
> >> +  fiber = 0;
> > 
> > This read is expensive, since the MDIO bus is slow. It would be better
> > just to pass fibre as a parameter.
> 
> But this function is used for other Marvell's phy, without fiber link for 
> example.
> And this function should has only the struct phy_device as parameter.
> 
> I don't have idea to avoid that, without create a custom function for that 
> which would be very similar to this function.
> Or used a phy_device field for that? I think it's awful idea...

So i would have

static int marvell_read_status_page(struct phy_device *phydev, int page)
{}

basically doing what you have above, but without the read.

static int marvell_read_status(struct phy_device *phydev)
{
if (phydev->supported & SUPPORTED_FIBRE) {
marvell_read_status_page(phydev, MII_M_FIBER);
if (phydev->link)
return;

return marvell_read_status_page(phydev, MII_M_COPPER);
}

> > I think it would be better to look for SUPPORTED_FIBRE in
> > drv->features, rather than have two different functions.
> > 
> > In fact, i would do that in general, rather than add your _fibre()
> > functions.
> 
> So, you suggest to do that in genphy_* functions or create marvell_* 
> functions with this condition?
> I'm agree with the second suggestion.

The second.

> 
> >> +
> >> +/* marvell_resume_fiber
> >> + *
> >> + * Some Marvell's phys have two modes: fiber and copper.
> >> + * Both need to be resumed
> >> + */
> >> +static int marvell_resume_fiber(struct phy_device *phydev)
> >> +{
> >> +  int err;
> >> +
> >> +  /* Resume the fiber mode first */
> >> +  err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_FIBER);
> >> +  if (err < 0)
> >> +  goto error;
> >> +
> >> +  err = genphy_resume(phydev);
> >> +  if (err < 0)
> >> +  goto error;
> >> +
> >> +  /* Then, the copper link */
> >> +  err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_COPPER);
> >> +  if (err < 0)
> >> +  goto error;
> >> +
> >> +  return genphy_resume(phydev);
> > 
> > Should it be resumed twice? Or just once at the end?  Same question
> > for suspend.
> 
> I don't understand your question.

You call genphy_resume(phydev) twice. Once is sufficient.

Andrew


Re: [PATCH v2 6/6] dt-bindings: net: bgmac: add bindings documentation for bgmac

2016-07-13 Thread Rob Herring
On Thu, Jul 07, 2016 at 07:08:58PM -0400, Jon Mason wrote:
> Signed-off-by: Jon Mason 
> ---
>  .../devicetree/bindings/net/brcm,amac.txt  | 24 
> ++
>  .../devicetree/bindings/net/brcm,bgmac-nsp.txt | 24 
> ++
>  2 files changed, 48 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/brcm,amac.txt
>  create mode 100644 Documentation/devicetree/bindings/net/brcm,bgmac-nsp.txt

Acked-by: Rob Herring 


Re: [PATCH 1/4] mac80211: mesh: flush stations before beacons are stopped

2016-07-13 Thread Bob Copeland
On Wed, Jul 13, 2016 at 10:11:25AM +, Machani, Yaniv wrote:
> > > Some drivers (e.g. wl18xx) expect that the last stage in the 
> > > de-initialization process will be stopping the beacons, similar to ap.
> > > Update ieee80211_stop_mesh() flow accordingly.
> > >
> > How well have you tested that with other drivers?
> > 
> 
> Sorry for the delayed response (I've been out) and thanks for your comments,
> I have tested it with RT3572 as well, and didn't see any issue.
> I'll update the comment to reflect that.

I'll give this a test on ath10k and wcn36xx as they are the ones most
likely to care about ordering.

-- 
Bob Copeland %% http://bobcopeland.com/


Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()

2016-07-13 Thread Matan Barak

On 13/07/2016 16:04, Leon Romanovsky wrote:

On Wed, Jul 13, 2016 at 02:48:44PM +0300, Dan Carpenter wrote:

We accidentally return success when we had intended to return an error
code.

Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads 
mode')
Signed-off-by: Dan Carpenter 
---
v2: return -ENOTSUPP instead --EINVAL


I'm a little bit confused. Why did you prefer ENOTSUPP over EOPNOTSUPP?


According to [1], it fits our case better - the operation is valid and makes 
sense, but isn't supported.


[1] https://lists.gnu.org/archive/html/bug-glibc/2002-08/msg00017.html



Thanks.





Re: [PATCH v5 10/11] Documentation: dtb: xgene: Add MDIO node

2016-07-13 Thread Rob Herring
On Thu, Jul 07, 2016 at 04:02:58PM -0700, Iyappan Subramanian wrote:
> Signed-off-by: Iyappan Subramanian 
> Tested-by: Fushen Chen 
> Tested-by: Toan Le 
> Tested-by: Matthias Brugger 
> ---
>  .../devicetree/bindings/net/apm-xgene-mdio.txt | 37 
> ++
>  1 file changed, 37 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/apm-xgene-mdio.txt
> 
> diff --git a/Documentation/devicetree/bindings/net/apm-xgene-mdio.txt 
> b/Documentation/devicetree/bindings/net/apm-xgene-mdio.txt
> new file mode 100644
> index 000..0247e70
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/apm-xgene-mdio.txt
> @@ -0,0 +1,37 @@
> +APM X-Gene SoC MDIO node
> +
> +MDIO node is defined to describe on-chip MDIO controller.
> +
> +Required properties:
> + - compatible: Must be "apm,xgene-mdio-rgmii"
> + - #address-cells: Must be <1>.
> + - #size-cells: Must be <0>.
> + - reg: Address and length of the register set
> + - clocks: Reference to the clock entry
> +
> +For the phys on the mdio bus, there must be a node with the following fields:
> + - compatible: PHY identifier.  Please refer ./phy.txt for the format.
> + - reg: The ID number for the phy.
> +
> +Example:
> +
> + mdio: mdio@0x1702 {

Drop the '0x'

With that,

Acked-by: Rob Herring 

> + compatible = "apm,xgene-mdio-rgmii";
> + #address-cells = <1>;
> + #size-cells = <0>;
> + reg = <0x0 0x1702 0x0 0xd100>;
> + clocks = <&menetclk 0>;
> + };
> +
> + /* Board-specific peripheral configurations */
> + &mdio {
> + menetphy: phy@3 {
> + reg = <0x3>;
> + };
> + sgenet0phy: phy@4 {
> + reg = <0x4>;
> + };
> + sgenet1phy: phy@5 {
> + reg = <0x5>;
> + };
> + };
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records

2016-07-13 Thread Daniel Borkmann

On 07/13/2016 02:10 PM, Peter Zijlstra wrote:

On Wed, Jul 13, 2016 at 11:24:13AM +0200, Daniel Borkmann wrote:

On 07/13/2016 09:52 AM, Peter Zijlstra wrote:

On Wed, Jul 13, 2016 at 12:36:17AM +0200, Daniel Borkmann wrote:

This patch adds support for non-linear data on raw records. It means
that for such data, the newly introduced __output_custom() helper will
be used instead of __output_copy(). __output_custom() will invoke
whatever custom callback is passed in via struct perf_raw_record_frag
to extract the data into the ring buffer slot.

To keep changes in perf_prepare_sample() and in perf_output_sample()
minimal, size/size_head split was added to perf_raw_record that call
sites fill out, so that two extra tests in fast-path can be avoided.

The few users of raw records are adapted to initialize their size_head
and frag data; no change in behavior for them. Later patch will extend
BPF side with a first user and callback for this facility, future users
could be things like XDP BPF programs (that work on different context
though and would thus have a different callback), etc.


Why? What problem are we solving?


I've tried to summarize it in patch 3/3,


Which is pretty useless if you're staring at this patch.


This currently has 3 issues we'd like to resolve:



i) We need two copies instead of just a single one for the skb data.
The data can be non-linear, see also skb_copy_bits() as an example for
walking/extracting it,


I'm not familiar enough with the network gunk to be able to read that.
But upto skb_walk_frags() it looks entirely linear to me.


Hm, fair enough, there are three parts, skb can have a linear part
which is taken via skb->data, either in its entirety or there can be a
non-linear part appended to that which can consist of pages that are in
shared info section (skb_shinfo(skb) -> frags[], nr_frags members), that
will be linearized, and in addition to that, appended after the frags[]
data there can be further skbs to the 'root' skb that contain fragmented
data, which is all what skb_copy_bits() copies linearized into 'to' buffer.
So depending on the origin of the skb, its structure can be quite different
and skb_copy_bits() covers all the cases generically. Maybe [1] summarizes
it better if you want to familiarize yourself with how skbs work,
although some parts are not up to date anymore.

  [1] http://vger.kernel.org/~davem/skb_data.html


ii) for static verification reasons, the bpf_skb_load_bytes() helper
needs to see a constant size on the passed buffer to make sure BPF
verifier can do its sanity checks on it during verification time, so
just passing in skb->len (or any other non-constant value) wouldn't
work, but changing bpf_skb_load_bytes() is also not the real solution
since we still have two copies we'd like to avoid as well, and



iii) bpf_skb_load_bytes() is just for rather smaller buffers (e.g.
headers) since they need to sit on the limited eBPF stack anyway. The
set would improve the BPF helper to address all 3 at once.


Humm, maybe. Lemme go try and reverse engineer that patch, because I'm
not at all sure wth it's supposed to do, nor am I entirely sure this
clarified things :/


[PATCH -next] stmmac: dwmac-socfpga: remove redundant dev_err call in socfpga_dwmac_parse_data()

2016-07-13 Thread weiyj_lk
From: Wei Yongjun 

There is a error message within devm_ioremap_resource
already, so remove the dev_err call to avoid redundant
error message.

Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
index 3bc1fa2..edd20c3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c
@@ -165,12 +165,8 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac 
*dwmac, struct device *
dwmac->splitter_base =
devm_ioremap_resource(dev, &res_splitter);
 
-   if (IS_ERR(dwmac->splitter_base)) {
-   dev_err(dev,
-   "%s: ERROR: failed mapping emac 
splitter\n",
-   __func__);
+   if (IS_ERR(dwmac->splitter_base))
return PTR_ERR(dwmac->splitter_base);
-   }
}
 
index = of_property_match_string(np_sgmii_adapter, "reg-names",
@@ -188,11 +184,8 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac 
*dwmac, struct device *
dwmac->pcs.sgmii_adapter_base =
devm_ioremap_resource(dev, &res_sgmii_adapter);
 
-   if (IS_ERR(dwmac->pcs.sgmii_adapter_base)) {
-   dev_err(dev, "%s: failed to mapping adapter\n",
-   __func__);
+   if (IS_ERR(dwmac->pcs.sgmii_adapter_base))
return PTR_ERR(dwmac->pcs.sgmii_adapter_base);
-   }
}
 
index = of_property_match_string(np_sgmii_adapter, "reg-names",
@@ -210,12 +203,8 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac 
*dwmac, struct device *
dwmac->pcs.tse_pcs_base =
devm_ioremap_resource(dev, &res_tse_pcs);
 
-   if (IS_ERR(dwmac->pcs.tse_pcs_base)) {
-   dev_err(dev,
-   "%s: ERROR: failed mapping tse control 
port\n",
-   __func__);
+   if (IS_ERR(dwmac->pcs.tse_pcs_base))
return PTR_ERR(dwmac->pcs.tse_pcs_base);
-   }
}
}
dwmac->reg_offset = reg_offset;






[PATCH -next] net: ethernet: bgmac: Remove redundant dev_err call in bgmac_probe()

2016-07-13 Thread weiyj_lk
From: Wei Yongjun 

There is a error message within devm_ioremap_resource
already, so remove the dev_err call to avoid redundant
error message.

Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/broadcom/bgmac-platform.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c 
b/drivers/net/ethernet/broadcom/bgmac-platform.c
index 1a2d841..be52f27 100644
--- a/drivers/net/ethernet/broadcom/bgmac-platform.c
+++ b/drivers/net/ethernet/broadcom/bgmac-platform.c
@@ -129,10 +129,8 @@ static int bgmac_probe(struct platform_device *pdev)
}
 
bgmac->plat.base = devm_ioremap_resource(&pdev->dev, regs);
-   if (IS_ERR(bgmac->plat.base)) {
-   dev_err(&pdev->dev, "Unable to map base resource\n");
+   if (IS_ERR(bgmac->plat.base))
return PTR_ERR(bgmac->plat.base);
-   }
 
regs = platform_get_resource_byname(pdev, IORESOURCE_MEM, "idm_base");
if (!regs) {
@@ -141,10 +139,8 @@ static int bgmac_probe(struct platform_device *pdev)
}
 
bgmac->plat.idm_base = devm_ioremap_resource(&pdev->dev, regs);
-   if (IS_ERR(bgmac->plat.idm_base)) {
-   dev_err(&pdev->dev, "Unable to map idm resource\n");
+   if (IS_ERR(bgmac->plat.idm_base))
return PTR_ERR(bgmac->plat.idm_base);
-   }
 
bgmac->read = platform_bgmac_read;
bgmac->write = platform_bgmac_write;






  1   2   >