Re: [RFC PATCH v2 01/10] net: sched: allow qdiscs to handle locking
On 16-07-13 11:19 PM, John Fastabend wrote: > This patch adds a flag for queueing disciplines to indicate the stack > does not need to use the qdisc lock to protect operations. This can > be used to build lockless scheduling algorithms and improving > performance. > > The flag is checked in the tx path and the qdisc lock is only taken > if it is not set. For now use a conditional if statement. Later we > could be more aggressive if it proves worthwhile and use a static key > or wrap this in a likely(). > > Signed-off-by: John Fastabend > --- [...] > @@ -3075,6 +3075,27 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, > struct Qdisc *q, > int rc; > > qdisc_calculate_pkt_len(skb, q); > + > + if (q->flags & TCQ_F_NOLOCK) { > + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { > + __qdisc_drop(skb, &to_free); > + rc = NET_XMIT_DROP; > + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) { > + qdisc_bstats_cpu_update(q, skb); > + __qdisc_run(q); Reviewing these patches now and noticed this qdisc_run() is not needed. > + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) > + __qdisc_run(q); > + rc = NET_XMIT_SUCCESS; > + } else { > + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; > + __qdisc_run(q); > + } > + > + if (unlikely(to_free)) > + kfree_skb_list(to_free); > + return rc; > + } > + [...] Thanks, John
Re: [PATCH] mlxsw: spectrum_router: Return -ENOENT in case of error
Thu, Jul 14, 2016 at 08:18:45AM CEST, christophe.jail...@wanadoo.fr wrote: >'vr' should be a valid pointer here, so returning 'PTR_ERR(vr)' is wrong. >Return an explicit error code (-ENOENT) instead. > This is for net-next. Fixes: 61c503f976 ("mlxsw: spectrum_router: Implement fib4 add/del switchdev obj ops") >Signed-off-by: Christophe JAILLET Acked-by: Jiri Pirko Thanks.
[RFC PATCH v2 10/10] net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq
The sch_mq qdisc creates a sub-qdisc per tx queue which are then called independently for enqueue and dequeue operations. However statistics are aggregated and pushed up to the "master" qdisc. This patch adds support for any of the sub-qdiscs to be per cpu statistic qdiscs. To handle this case add a check when calculating stats and aggregate the per cpu stats if needed. Also exports __gnet_stats_copy_queue() to use as a helper function. Signed-off-by: John Fastabend --- include/net/gen_stats.h |3 +++ net/core/gen_stats.c|9 + net/sched/sch_mq.c | 25 ++--- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 231e121..5ddc88b 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -47,6 +47,9 @@ int gnet_stats_copy_rate_est(struct gnet_dump *d, int gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue __percpu *cpu_q, struct gnet_stats_queue *q, __u32 qlen); +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +const struct gnet_stats_queue __percpu *cpu_q, +const struct gnet_stats_queue *q, __u32 qlen); int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); int gnet_stats_finish_copy(struct gnet_dump *d); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 508e051..a503547 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -254,10 +254,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +const struct gnet_stats_queue __percpu *cpu, +const struct gnet_stats_queue *q, +__u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -271,6 +271,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** 
* gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index b943982..f4b5bbb 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -17,6 +17,7 @@ #include #include #include +#include struct mq_sched { struct Qdisc**qdiscs; @@ -107,15 +108,25 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) memset(&sch->qstats, 0, sizeof(sch->qstats)); for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; + __u32 qlen = 0; + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues+= qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; + + if (qdisc_is_percpu_stats(qdisc)) { + cpu_bstats = qdisc->cpu_bstats; + cpu_qstats = qdisc->cpu_qstats; + } + + qlen = qdisc_qlen_sum(qdisc); + + __gnet_stats_copy_basic(NULL, &sch->bstats, + cpu_bstats, &qdisc->bstats); + __gnet_stats_copy_queue(&sch->qstats, + cpu_qstats, &qdisc->qstats, qlen); + spin_unlock_bh(qdisc_lock(qdisc)); } return 0;
[RFC PATCH v2 09/10] net: sched: helper to sum qlen
Reporting qlen when qlen is per cpu requires aggregating the per cpu counters. This adds a helper routine for this. Signed-off-by: John Fastabend --- include/net/sch_generic.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 149f079..d370fee 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -271,6 +271,21 @@ static inline int qdisc_qlen(const struct Qdisc *q) return q->q.qlen; } +static inline int qdisc_qlen_sum(const struct Qdisc *q) +{ + __u32 qlen = 0; + int i; + + if (q->flags & TCQ_F_NOLOCK) { + for_each_possible_cpu(i) + qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; + } else { + qlen = q->q.qlen; + } + + return qlen; +} + static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb) { return (struct qdisc_skb_cb *)skb->cb;
[RFC PATCH v2 08/10] net: sched: pfifo_fast use alf_queue
This converts the pfifo_fast qdisc to use the alf_queue enqueue and dequeue routines then sets the NOLOCK bit. This also removes the logic used to pick the next band to dequeue from and instead just checks each alf_queue for packets from top priority to lowest. This might need to be a bit more clever but seems to work for now. Signed-off-by: John Fastabend --- net/sched/sch_generic.c | 131 +++ 1 file changed, 75 insertions(+), 56 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 7dcd066..2ac3eb9 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -555,88 +556,79 @@ static const u8 prio2band[TC_PRIO_MAX + 1] = { /* * Private data for a pfifo_fast scheduler containing: - * - queues for the three band - * - bitmap indicating which of the bands contain skbs + * - rings for priority bands */ struct pfifo_fast_priv { - u32 bitmap; - struct sk_buff_head q[PFIFO_FAST_BANDS]; + struct skb_array q[PFIFO_FAST_BANDS]; }; -/* - * Convert a bitmap to the first band number where an skb is queued, where: - * bitmap=0 means there are no skbs on any band. - * bitmap=1 means there is an skb on band 0. - * bitmap=7 means there are skbs on all 3 bands, etc. 
- */ -static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0}; - -static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, -int band) +static inline struct skb_array *band2list(struct pfifo_fast_priv *priv, + int band) { - return priv->q + band; + return &priv->q[band]; } static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc, struct sk_buff **to_free) { - if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { - int band = prio2band[skb->priority & TC_PRIO_MAX]; - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - struct sk_buff_head *list = band2list(priv, band); - - priv->bitmap |= (1 << band); - qdisc->q.qlen++; - return __qdisc_enqueue_tail(skb, qdisc, list); - } + int band = prio2band[skb->priority & TC_PRIO_MAX]; + struct pfifo_fast_priv *priv = qdisc_priv(qdisc); + struct skb_array *q = band2list(priv, band); + int err; - return qdisc_drop(skb, qdisc, to_free); + err = skb_array_produce_bh(q, skb); + + if (unlikely(err)) + return qdisc_drop_cpu(skb, qdisc, to_free); + + qdisc_qstats_cpu_qlen_inc(qdisc); + qdisc_qstats_cpu_backlog_inc(qdisc, skb); + return NET_XMIT_SUCCESS; } static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; + struct sk_buff *skb = NULL; + int band; - if (likely(band >= 0)) { - struct sk_buff_head *list = band2list(priv, band); - struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list); + for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) { + struct skb_array *q = band2list(priv, band); - qdisc->q.qlen--; - if (skb_queue_empty(list)) - priv->bitmap &= ~(1 << band); + if (__skb_array_empty(q)) + continue; - return skb; + skb = skb_array_consume_bh(q); } - return NULL; -} - -static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) -{ - struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - int band = bitmap2band[priv->bitmap]; - - if (band >= 0) { - struct sk_buff_head *list = 
band2list(priv, band); - - return skb_peek(list); + if (likely(skb)) { + qdisc_qstats_cpu_backlog_dec(qdisc, skb); + qdisc_bstats_cpu_update(qdisc, skb); + qdisc_qstats_cpu_qlen_dec(qdisc); } - return NULL; + return skb; } static void pfifo_fast_reset(struct Qdisc *qdisc) { - int prio; + int i, band; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); - for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) - __qdisc_reset_queue(band2list(priv, prio)); + for (band = 0; band < PFIFO_FAST_BANDS; band++) { + struct skb_array *q = band2list(priv, band); + struct sk_buff *skb; - priv->bitmap = 0; - qdisc->qstats.backlog = 0; - qdisc->q.qlen = 0; + while ((skb = skb_array_consume_bh(q)) != NULL) + kfree_skb(skb); + } + + for_each_possible_cpu(i) { + struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats,
[RFC PATCH v2 07/10] net: sched: support skb_bad_tx with lockless qdisc
Similar to how gso is handled skb_bad_tx needs to be per cpu to handle lockless qdisc with multiple writer/producers. Signed-off-by: John Fastabend --- include/net/sch_generic.h |7 +++ net/sched/sch_api.c |5 ++ net/sched/sch_generic.c | 94 + 3 files changed, 97 insertions(+), 9 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7b140e2..149f079 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -40,6 +40,10 @@ struct gso_cell { struct sk_buff *skb; }; +struct bad_txq_cell { + struct sk_buff *skb; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -77,7 +81,8 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - struct gso_cell __percpu *gso_cpu_skb; + struct gso_cell __percpu *gso_cpu_skb; + struct bad_txq_cell __percpu *skb_bad_txq_cpu; /* * For performance sake on SMP, we put highly modified fields at the end diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d713052..50088e2 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -970,6 +970,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, sch->gso_cpu_skb = alloc_percpu(struct gso_cell); if (!sch->gso_cpu_skb) goto err_out4; + + sch->skb_bad_txq_cpu = alloc_percpu(struct bad_txq_cell); + if (!sch->skb_bad_txq_cpu) + goto err_out4; } if (tca[TCA_STAB]) { @@ -1021,6 +1025,7 @@ err_out4: free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); free_percpu(sch->gso_cpu_skb); + free_percpu(sch->skb_bad_txq_cpu); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. 
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 8a665dc..7dcd066 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -44,6 +44,42 @@ EXPORT_SYMBOL(default_qdisc_ops); * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. */ +static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *sch) +{ + if (sch->skb_bad_txq_cpu) { + struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu); + + return cell->skb; + } + + return sch->skb_bad_txq; +} + +static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *sch, +struct sk_buff *skb) +{ + if (sch->skb_bad_txq_cpu) { + struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu); + + cell->skb = skb; + return; + } + + sch->skb_bad_txq = skb; +} + +static inline void qdisc_null_skb_bad_txq(struct Qdisc *sch) +{ + if (sch->skb_bad_txq_cpu) { + struct bad_txq_cell *cell = this_cpu_ptr(sch->skb_bad_txq_cpu); + + cell->skb = NULL; + return; + } + + sch->skb_bad_txq = NULL; +} + static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch) { if (sch->gso_cpu_skb) @@ -129,9 +165,15 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q, if (!nskb) break; if (unlikely(skb_get_queue_mapping(nskb) != mapping)) { - q->skb_bad_txq = nskb; - qdisc_qstats_backlog_inc(q, nskb); - q->q.qlen++; + qdisc_enqueue_skb_bad_txq(q, nskb); + + if (qdisc_is_percpu_stats(q)) { + qdisc_qstats_cpu_backlog_inc(q, nskb); + qdisc_qstats_cpu_qlen_inc(q); + } else { + qdisc_qstats_backlog_inc(q, nskb); + q->q.qlen++; + } break; } skb->next = nskb; @@ -160,7 +202,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, qdisc_null_gso_skb(q); if (qdisc_is_percpu_stats(q)) { - qdisc_qstats_cpu_backlog_inc(q, skb); + qdisc_qstats_cpu_backlog_dec(q, skb); qdisc_qstats_cpu_qlen_dec(q); } else { qdisc_qstats_backlog_dec(q, skb); @@ -171,14 +213,19 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, 
bool *validate,
[RFC PATCH v2 06/10] net: sched: support qdisc_reset on NOLOCK qdisc
The qdisc_reset operation depends on the qdisc lock at the moment to halt any additions to gso_skb and statistics while the list is free'd and the stats zeroed. Without the qdisc lock we can not guarantee another cpu is not in the process of adding a skb to one of the "cells". Here are the two cases we have to handle. case 1: qdisc_graft operation. In this case a "new" qdisc is attached and the 'qdisc_destroy' operation is called on the old qdisc. The destroy operation will wait a rcu grace period and call qdisc_rcu_free(). At which point gso_cpu_skb is free'd along with all stats so no need to zero stats and gso_cpu_skb from the reset operation itself. Because we can not continue to call qdisc_reset before waiting an rcu grace period so that the qdisc is detached from all cpus simply do not call qdisc_reset() at all and let the qdisc_destroy operation clean up the qdisc. Note, a refcnt greater than 1 would cause the destroy operation to be aborted however if this ever happened the reference to the qdisc would be lost and we would have a memory leak. case 2: dev_deactivate sequence. This can come from a user bringing the interface down which causes the gso_skb list to be flushed and the qlen zero'd. At the moment this is protected by the qdisc lock so while we clear the qlen/gso_skb fields we are guaranteed no new skbs are added. For the lockless case though this is not true. To resolve this move the qdisc_reset call after the new qdisc is assigned and a grace period is exercised to ensure no new skbs can be enqueued. Further the RTNL lock is held so we can not get another call to activate the qdisc while the skb lists are being free'd. Finally, fix qdisc_reset to handle the per cpu stats and skb lists. 
Signed-off-by: John Fastabend --- net/sched/sch_generic.c | 45 +++-- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f903093..8a665dc 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -739,6 +739,20 @@ void qdisc_reset(struct Qdisc *qdisc) kfree_skb(qdisc->skb_bad_txq); qdisc->skb_bad_txq = NULL; + if (qdisc->gso_cpu_skb) { + int i; + + for_each_possible_cpu(i) { + struct gso_cell *cell; + + cell = per_cpu_ptr(qdisc->gso_cpu_skb, i); + if (cell) { + kfree_skb_list(cell->skb); + cell = NULL; + } + } + } + if (qdisc->gso_skb) { kfree_skb_list(qdisc->gso_skb); qdisc->gso_skb = NULL; @@ -814,10 +828,6 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, root_lock = qdisc_lock(oqdisc); spin_lock_bh(root_lock); - /* Prune old scheduler */ - if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) - qdisc_reset(oqdisc); - /* ... and graft new one */ if (qdisc == NULL) qdisc = &noop_qdisc; @@ -931,7 +941,6 @@ static void dev_deactivate_queue(struct net_device *dev, set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -968,6 +977,16 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } +static void dev_qdisc_reset(struct net_device *dev, + struct netdev_queue *dev_queue, + void *none) +{ + struct Qdisc *qdisc = dev_queue->qdisc_sleeping; + + if (qdisc) + qdisc_reset(qdisc); +} + /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -978,7 +997,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate_many(struct list_head *head) { struct net_device *dev; - bool sync_needed = false; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, @@ -988,20 +1006,27 @@ void dev_deactivate_many(struct list_head *head) &noop_qdisc); 
dev_watchdog_down(dev); - sync_needed |= !dev->dismantle; } /* Wait for outstanding qdisc-less dev_queue_xmit calls. * This is avoided if all devices are in dismantle phase : * Caller will call synchronize_net() for us */ - if (sync_needed) - synchronize_net(); + synchronize_net(); /* Wait fo
[RFC PATCH v2 05/10] net: sched: per cpu gso handlers
The net sched infrastructure has a gso ptr that points to skb structs that have failed to be enqueued by the device driver. This can happen when multiple cores try to push a skb onto the same underlying hardware queue resulting in lock contention. This case is handled by a cpu collision handler handle_dev_cpu_collision(). Another case occurs when the stack overruns the drivers low level tx queues capacity. Ideally these should be a rare occurrence in a well-tuned system but they do happen. To handle this in the lockless case use a per cpu gso field to park the skb until the conflict can be resolved. Note at this point the skb has already been popped off the qdisc so it has to be handled by the infrastructure. Signed-off-by: John Fastabend --- include/net/sch_generic.h | 37 +++ net/sched/sch_api.c |7 net/sched/sch_generic.c | 71 ++--- 3 files changed, 110 insertions(+), 5 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f69da4b..7b140e2 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -36,6 +36,10 @@ struct qdisc_size_table { u16 data[]; }; +struct gso_cell { + struct sk_buff *skb; +}; + struct Qdisc { int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, @@ -73,6 +77,8 @@ struct Qdisc { struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; + struct gso_cell __percpu *gso_cpu_skb; + /* * For performance sake on SMP, we put highly modified fields at the end */ @@ -725,6 +731,22 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) return sch->gso_skb; } +static inline struct sk_buff *qdisc_peek_dequeued_cpu(struct Qdisc *sch) +{ + struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb); + + if (!gso->skb) { + struct sk_buff *skb = sch->dequeue(sch); + + if (skb) { + gso->skb = skb; + qdisc_qstats_cpu_qlen_inc(sch); + } + } + + return gso->skb; +} + /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct 
sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { @@ -741,6 +763,21 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) return skb; } +static inline struct sk_buff *qdisc_dequeue_peeked_skb(struct Qdisc *sch) +{ + struct gso_cell *gso = this_cpu_ptr(sch->gso_cpu_skb); + struct sk_buff *skb = gso->skb; + + if (skb) { + gso->skb = NULL; + qdisc_qstats_cpu_qlen_dec(sch); + } else { + skb = sch->dequeue(sch); + } + + return skb; +} + static inline void __qdisc_reset_queue(struct sk_buff_head *list) { /* diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 12ebde8..d713052 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -966,6 +966,12 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, goto err_out4; } + if (sch->flags & TCQ_F_NOLOCK) { + sch->gso_cpu_skb = alloc_percpu(struct gso_cell); + if (!sch->gso_cpu_skb) + goto err_out4; + } + if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB]); if (IS_ERR(stab)) { @@ -1014,6 +1020,7 @@ err_out: err_out4: free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); + free_percpu(sch->gso_cpu_skb); /* * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index fc70204..f903093 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -44,8 +44,25 @@ EXPORT_SYMBOL(default_qdisc_ops); * - ingress filtering is also serialized via qdisc root lock * - updates to tree and tree walking are only done under the rtnl mutex. 
*/ +static inline struct sk_buff *qdisc_dequeue_gso_skb(struct Qdisc *sch) +{ + if (sch->gso_cpu_skb) + return (this_cpu_ptr(sch->gso_cpu_skb))->skb; -static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) + return sch->gso_skb; +} + +static inline void qdisc_null_gso_skb(struct Qdisc *sch) +{ + if (sch->gso_cpu_skb) { + (this_cpu_ptr(sch->gso_cpu_skb))->skb = NULL; + return; + } + + sch->gso_skb = NULL; +} + +static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { q->gso_skb = skb; q->qstats.requeues++; @@ -56,6 +73,25 @@ static inline int
[RFC PATCH v2 04/10] net: sched: a dflt qdisc may be used with per cpu stats
Enable dflt qdisc support for per cpu stats before this patch a dflt qdisc was required to use the global statistics qstats and bstats. Signed-off-by: John Fastabend --- net/sched/sch_generic.c | 24 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2c3e23b..fc70204 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -647,18 +647,34 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, struct Qdisc *sch; if (!try_module_get(ops->owner)) - goto errout; + return NULL; sch = qdisc_alloc(dev_queue, ops); if (IS_ERR(sch)) - goto errout; + return NULL; sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL) == 0) + if (!ops->init) return sch; - qdisc_destroy(sch); + if (ops->init(sch, NULL)) + goto errout; + + /* init() may have set percpu flags so init data structures */ + if (qdisc_is_percpu_stats(sch)) { + sch->cpu_bstats = + netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!sch->cpu_bstats) + goto errout; + + sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!sch->cpu_qstats) + goto errout; + } + + return sch; errout: + qdisc_destroy(sch); return NULL; } EXPORT_SYMBOL(qdisc_create_dflt);
[RFC PATCH v2 03/10] net: sched: provide per cpu qstat helpers
The per cpu qstats support was added with per cpu bstat support which is currently used by the ingress qdisc. This patch adds a set of helpers needed to make other qdiscs that use qstats per cpu as well. Signed-off-by: John Fastabend --- include/net/sch_generic.h | 39 +++ 1 file changed, 39 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 354951d..f69da4b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -573,12 +573,43 @@ static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch, sch->qstats.backlog -= qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch, + const struct sk_buff *skb) +{ + struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats); + + q->backlog -= qdisc_pkt_len(skb); +} + static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch, const struct sk_buff *skb) { sch->qstats.backlog += qdisc_pkt_len(skb); } +static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, + const struct sk_buff *skb) +{ + struct gnet_stats_queue *q = this_cpu_ptr(sch->cpu_qstats); + + q->backlog += qdisc_pkt_len(skb); +} + +static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) +{ + this_cpu_ptr(sch->cpu_qstats)->qlen++; +} + +static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) +{ + this_cpu_ptr(sch->cpu_qstats)->qlen--; +} + +static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) +{ + this_cpu_ptr(sch->cpu_qstats)->requeues++; +} + static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) { sch->qstats.drops += count; @@ -751,6 +782,14 @@ static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch) qdisc_qstats_drop(sch); } +static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch, +struct sk_buff **to_free) +{ + __qdisc_drop(skb, to_free); + qdisc_qstats_cpu_drop(sch); + + return NET_XMIT_DROP; +} static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff 
**to_free)
Re: [PATCH net-next 1/2] devlink: fix build error for CONFIG_MODULES=n
Wed, Jul 13, 2016 at 11:03:37PM CEST, a...@arndb.de wrote: >A driver calling trace_devlink_hwmsg cannot be built when modules are disabled: > >include/trace/events/devlink.h: In function >'trace_event_get_offsets_devlink_hwmsg': >include/trace/events/devlink.h:25:51: error: dereferencing pointer to >incomplete type 'struct module' > __string(owner_name, devlink->dev->driver->owner->name) > >This changes the code to only print the module name when modules are actually >enabled, otherwise we hardcode the string "built-in". > >Signed-off-by: Arnd Bergmann >Fixes: e5224f0fe2ac ("devlink: add hardware messages tracing facility") >--- > include/trace/events/devlink.h | 8  > 1 file changed, 8 insertions(+) > >diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h >index 333c32ac9bfa..26f92d3c7e9c 100644 >--- a/include/trace/events/devlink.h >+++ b/include/trace/events/devlink.h >@@ -22,7 +22,11 @@ TRACE_EVENT(devlink_hwmsg, > TP_STRUCT__entry( > __string(bus_name, devlink->dev->bus->name) > __string(dev_name, dev_name(devlink->dev)) >+#ifdef CONFIG_MODULES > __string(owner_name, devlink->dev->driver->owner->name) I think it would be better to use driver->name. It looks like it is always present. I will do some tests and send a patch. >+#else >+ __string(owner_name, "built-in") >+#endif > __field(bool, incoming) > __field(unsigned long, type) > __dynamic_array(u8, buf, len) >@@ -32,7 +36,11 @@ TRACE_EVENT(devlink_hwmsg, > TP_fast_assign( > __assign_str(bus_name, devlink->dev->bus->name); > __assign_str(dev_name, dev_name(devlink->dev)); >+#ifdef CONFIG_MODULES > __assign_str(owner_name, devlink->dev->driver->owner->name); >+#else >+ __assign_str(owner_name, "built-in"); >+#endif > __entry->incoming = incoming; > __entry->type = type; > memcpy(__get_dynamic_array(buf), buf, len); >-- >2.9.0 >
[RFC PATCH v2 00/10] running qdiscs without qdisc_lock
Hi, I thought I should go ahead and send this series out for comments. Here I allow qdiscs to be run without taking the qdisc lock. As a result statistics, gso skb, tx bad skb and a few other things need to be "safe" to run without locks. It _should_ all be covered here. Although I just noticed I must be missing a dec on the backlog counter somewhere as one of my tests just ended with 0 packets but a nonzero bytes counter. Also of note in this series I used the skb_array implementation already in net-next for the tun/tap devices. With this implementation for cases where lots of threads are hitting the same qdisc I see a modest improvement but for cases like mq with pktgen where everything is lined up nicely I see a fairly unpleasant regression. I have a few thoughts on how to resolve this. First if we support bulk_dequeue as an operation on the skb_array this should help vs getting the consumer lock repeatedly. Also we really don't need the HARD_TX_LOCK if we have a core per queue and XPS setup like many multiqueue nics default to. And I need to go back and look at the original alf ring implementation as well to see how it compares I don't recall seeing the mq regression there. Also after the above it might be nice to make all qdiscs support the per cpu statistics and drop non per cpu cases just to simplify the code and all the if/else branching where its not needed. As usual any thoughts, comments, etc are welcome. And I wasn't going to add these numbers just because they come from an untuned system but why not. Here are some initial numbers from pktgen on my development machine, which is a reasonable system (E5-2695), but I didn't do any work to tweak the config so there is still a bunch of debug/hacking options still running.
The pktgen command is ./samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i eth3 -t X -s 64 pfifo_fast original pps lockless diff 1 1418168 1269450 -148718 2 1587390 1553408 -33982 4 1084961 1683639 +598678 8 989636 1522723 +533087 12 1014018 1348172 +334154 mq original pps lockless diff 1 1442018 1205180 -236838 2 2646069 2266095 -379974 4 5136200 4269470 -866730 8 12 13275671 10810909 -2464762 --- John Fastabend (10): net: sched: allow qdiscs to handle locking net: sched: qdisc_qlen for per cpu logic net: sched: provide per cpu qstat helpers net: sched: a dflt qdisc may be used with per cpu stats net: sched: per cpu gso handlers net: sched: support qdisc_reset on NOLOCK qdisc net: sched: support skb_bad_tx with lockless qdisc net: sched: pfifo_fast use alf_queue net: sched: helper to sum qlen net: sched: add support for TCQ_F_NOLOCK subqueues to sch_mq include/net/gen_stats.h |3 include/net/sch_generic.h | 105 net/core/dev.c| 32 +++- net/core/gen_stats.c |9 + net/sched/sch_api.c | 12 + net/sched/sch_generic.c | 385 +++-- net/sched/sch_mq.c| 25 ++- 7 files changed, 467 insertions(+), 104 deletions(-) --
[RFC PATCH v2 02/10] net: sched: qdisc_qlen for per cpu logic
This is a bit interesting because it means sch_direct_xmit will return a positive value which causes the dequeue/xmit cycle to continue only when a specific cpu has a qlen > 0. However checking each cpu for qlen will break performance so its important to note that qdiscs that set the no lock bit need to have some sort of per cpu enqueue/dequeue data structure that maps to the per cpu qlen value. Signed-off-by: John Fastabend --- include/net/sch_generic.h |8 1 file changed, 8 insertions(+) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3de6a8c..354951d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -247,8 +247,16 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_cpu(const struct Qdisc *q) +{ + return this_cpu_ptr(q->cpu_qstats)->qlen; +} + static inline int qdisc_qlen(const struct Qdisc *q) { + if (q->flags & TCQ_F_NOLOCK) + return qdisc_qlen_cpu(q); + return q->q.qlen; }
[RFC PATCH v2 01/10] net: sched: allow qdiscs to handle locking
This patch adds a flag for queueing disciplines to indicate the stack does not need to use the qdisc lock to protect operations. This can be used to build lockless scheduling algorithms and improving performance. The flag is checked in the tx path and the qdisc lock is only taken if it is not set. For now use a conditional if statement. Later we could be more aggressive if it proves worthwhile and use a static key or wrap this in a likely(). Signed-off-by: John Fastabend --- include/net/sch_generic.h |1 + net/core/dev.c| 32 net/sched/sch_generic.c | 24 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 909aff2..3de6a8c 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -58,6 +58,7 @@ struct Qdisc { #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. */ +#define TCQ_F_NOLOCK 0x80 /* qdisc does not require locking */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/net/core/dev.c b/net/core/dev.c index b92d63b..f35d449 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3075,6 +3075,27 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q)) { + qdisc_bstats_cpu_update(q, skb); + __qdisc_run(q); + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) + __qdisc_run(q); + rc = NET_XMIT_SUCCESS; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. 
@@ -3896,19 +3917,22 @@ static void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) + spin_unlock(root_lock); } } } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index e95b67c..2c3e23b 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -170,7 +170,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, int ret = NETDEV_TX_BUSY; /* And release qdisc */ - spin_unlock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) + spin_unlock(root_lock); /* Note that we validate skb (GSO, checksum, ...) outside of locks */ if (validate) @@ -183,10 +184,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, HARD_TX_UNLOCK(dev, txq); } else { - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) + spin_lock(root_lock); return qdisc_qlen(q); } - spin_lock(root_lock); + + if (!(q->flags & TCQ_F_NOLOCK)) + spin_lock(root_lock); if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ @@ -868,14 +872,18 @@ static bool some_qdisc_is_busy(struct net_device *dev) dev_queue = netdev_get_tx_queue(dev, i); q = dev_queue->qdisc_sleeping; - root_lock = qdisc_lock(q); - spin_lock_bh(root_lock); + i
[PATCH] mlxsw: spectrum_router: Return -ENOENT in case of error
'vr' should be a valid pointer here, so returning 'PTR_ERR(vr)' is wrong. Return an explicit error code (-ENOENT) instead. Signed-off-by: Christophe JAILLET --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index e084ea5..81418d6 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -1803,7 +1803,7 @@ int mlxsw_sp_router_fib4_del(struct mlxsw_sp_port *mlxsw_sp_port, sizeof(fib4->dst), fib4->dst_len); if (!fib_entry) { dev_warn(mlxsw_sp->bus_info->dev, "Failed to find FIB4 entry being removed.\n"); - return PTR_ERR(vr); + return -ENOENT; } mlxsw_sp_fib_entry_del(mlxsw_sp_port->mlxsw_sp, fib_entry); mlxsw_sp_fib_entry_remove(vr->fib, fib_entry); -- 2.7.4 --- L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel antivirus Avast. https://www.avast.com/antivirus
Re: [PATCH v11 21/22] IB/hns: Kconfig and Makefile for RoCE module
On Sat, Jul 02, 2016 at 05:39:23PM +0800, Lijun Ou wrote: > This patch added Kconfig and Makefile for building RoCE module. > > Signed-off-by: Wei Hu > Signed-off-by: Nenglong Zhao > Signed-off-by: Lijun Ou > --- > PATCH v11: > hns_roce_icm.o -> hns_roce_hem.o > > PATCH v10/v9/v8/v7/v6/v5: > - No change over the PATCH v4 > > PATCH v4: > This fixes the comments given by Christoph Hellwig over the PATCH v3: > Link: https://lkml.org/lkml/2016/3/22/609 > > PATCH V3: > This fixes the comments given by Leon Romanovsky over the PATCH v2: > Link: https://lkml.org/lkml/2016/3/20/5 > > PATCH v2: > This fixes the comments given by Leon Romanovsky over the PATCH v1: > Link: https://lkml.org/lkml/2016/3/6/94 > Fixes the error tested by kbuild test robot over the PATCH v1: > Link: https://lkml.org/lkml/2016/3/4/343 > > PATCH v1: > - The initial patch > --- > --- > drivers/infiniband/Kconfig | 1 + > drivers/infiniband/hw/Makefile | 1 + > drivers/infiniband/hw/hns/Kconfig | 10 ++ > drivers/infiniband/hw/hns/Makefile | 8 > 4 files changed, 20 insertions(+) > create mode 100644 drivers/infiniband/hw/hns/Kconfig > create mode 100644 drivers/infiniband/hw/hns/Makefile > > diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig > index 2137adf..767f92b 100644 > --- a/drivers/infiniband/Kconfig > +++ b/drivers/infiniband/Kconfig > @@ -74,6 +74,7 @@ source "drivers/infiniband/hw/mlx5/Kconfig" > source "drivers/infiniband/hw/nes/Kconfig" > source "drivers/infiniband/hw/ocrdma/Kconfig" > source "drivers/infiniband/hw/usnic/Kconfig" > +source "drivers/infiniband/hw/hns/Kconfig" > > source "drivers/infiniband/ulp/ipoib/Kconfig" > > diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile > index c0c7cf8..2ad851d 100644 > --- a/drivers/infiniband/hw/Makefile > +++ b/drivers/infiniband/hw/Makefile > @@ -9,3 +9,4 @@ obj-$(CONFIG_INFINIBAND_NES) += nes/ > obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ > obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ > 
obj-$(CONFIG_INFINIBAND_HFI1)+= hfi1/ > +obj-$(CONFIG_INFINIBAND_HISILICON_HNS) += hns/ --^^^-- There is no need in HISILICON word here. > diff --git a/drivers/infiniband/hw/hns/Kconfig > b/drivers/infiniband/hw/hns/Kconfig > new file mode 100644 > index 000..c47c168 > --- /dev/null > +++ b/drivers/infiniband/hw/hns/Kconfig > @@ -0,0 +1,10 @@ > +config INFINIBAND_HISILICON_HNS > + tristate "Hisilicon Hns ROCE Driver" And you are still inconsistent with the names Hisilicon/HiSilicon/hisilicon/HISILICON/e.t.c., ROCE/roce/RoCE/e.t.c. signature.asc Description: Digital signature
Re: [PATCH v11 00/22] Add HiSilicon RoCE driver
On Thu, Jul 14, 2016 at 11:43:59AM +0800, oulijun wrote: > 在 2016/7/2 17:39, Lijun Ou 写道: > > > Hi, Doug & Sean Hefty & Hal Rosenstock > "Hello, I understand that the maintainer is dealing with lots of patches not just > mine. Also, I could not see any further review comments from the community. > I also understand that I should not resend the patch-set again unless I am > sure my patch-set is lost. > I was just wondering what should I do in the current circumstance where my > PATCH" has no activity. > I am not sure if this has been accepted or how much I need to wait to resend > it (if ever). Please guide, I am new to open-source and learning from people > like you. Thanks a lot :) You were asked numerous times to clean the mess in your TO/CC fields. Most of the people have nothing to do with your submission. Understanding who is the RDMA maintainer will help you a lot (hint: it is one of three in your opening sentence). Another request from you, which you successfully ignored, was to stop replying with the whole email, and to reply with only the relevant part. Ignoring community rules is a good way to be ignored back. BTW, you don't need to resend patches, please follow the patchwork status at https://patchwork.kernel.org/project/linux-rdma/list/?submitter=157841&state=1 > > Thanks > Lijun Ou > > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html signature.asc Description: Digital signature
Re: [PATCH 0/2] Code style fixes
From: Elad Kanfi Date: Wed, 13 Jul 2016 16:58:05 +0300 > Fix all checkpatch warnings and errors, and reuse code Series applied to net-next, thanks.
Re: [PATCH v11 00/22] Add HiSilicon RoCE driver
在 2016/7/2 17:39, Lijun Ou 写道: > The HiSilicon Network Subsystem is a long term evolution IP which is > supposed to be used in HiSilicon ICT SoCs. HNS (HiSilicon Network > Subsystem) also has a hardware support of performing RDMA with > RoCEE. > The driver for HiSilicon RoCEE(RoCE Engine) is a platform driver and > will support multiple versions of SOCs in future. This version of driver > is meant to support Hip06 SoC(which conforms to RoCEv1 hardware > specifications). > > Changes v10 -> v11: > [1/22]: > 1. modify the print description of chip don't support roce > 2. remove explicit values for enums for patch series > [3/22]: > 3. remove non-essential headers for patch series > 4. add judgement for port_cnt is zero > 5. Keep unified print style for "set mask..." vs. "No usable >..." > 6. modify the MODULE_LICENSE > 7. remove MODULE_ALIAS > [4/22]: > 8. Move this line out of if-else and leave "if (enable)" part only > 9. renaming the meaningful definition to 20 for patch series > 10. delete extern keyword for hns_dsaf_roce_reset function > 11. delete void keyword for hr_dev->hw->reset when driver removed > [5/22]: > 12. remove few unnecessary variables and some lines. > 13. remove the function for one line of code which will be called > once only for patch series > [6/22]: > 14. redesign the method for calculating token_mask' value > [7/22]: > 15. delete hns_roce_status_to_errno > 16. modify the one enum only for all patches > 17. remove the spin_lock in hns_roce_cq_event function > 18. add comment here that 0x10 and 0x11 in hns_roce_event struct > 19. refactor hns_roce_aeq_int function and It has switch in switch > and it is almost 200 LOCs > 20. simplify the lines for err_out_free_pages branch > [8/22]: > 21. remove icm and redesign it for patch series > > Changes v9 -> v10: > 1. delete redundant lines which it is netdevice.h in hns_roce_main.c > 2. adjust the indentation for HNS_ROCE_V1_NUM_ASYNC_EQE > 3. simplify the lines in hns_roce_init_qp_table function > 4. 
add static type for hns_roce_unregister_device > 5. move the call with hns_roce_unregister_device from the tenth patch to >the eleventh patch in hns_roce_remove function > 6. readjuest the alphabetic order in MAINTAINERS > 7. redesigned the way for getting irq names > 8. avoid the memory leakage because mr->pbl is not free in >hns_roce_mr function > 9. avoid the memory leakage because not kfree table->icm when exception > 10. add the link from LKML as well whose comment in all > > Changes v8 -> v9: > 1. delete the definition of ADDR_SHIFT_n, use literal 12, 32 and 44 and >add comments > 2. use roce_read/roce_write/readl/write instead of roce_readl/roce_writel > 3. delete the print error/debug messages for memory allocation errors > 4. use exit instead of uninit, for example hw->uninit -> hw->exit > 5. use roce_raw_write instead of _raw_writel in eq_set_cons_index > 6. modify the label with underscore > 7. adjust the indentation for the macro definitions in hns_roce_hw_v1.c > 8. simplify some lines in few functions and structures > 9. adjust the alphabetic order in MAINTAINERS > > Changes v7 -> v8: > 1. add a verbs operation named get_port_immutable. It is an >independent patch > 2. add a comment for the definition of ADDR_SHIFT_n, n are 12,32 >and 44 > 3. restructures the code to align with naming convention of the Linux >according to the review of Doug Ledford > 4. modify the state for all .c and .h files > > Changes v6 -> v7: > 1. modify some type of parameter, use bool replace the original type > 2. add the Signed-off-by signatures in the first patch > 3. delete the improper print sentence in hns_roce_create_eq. > > Changes v5 -> v6: > 1. modify the type of obj for unsigned long according the reviews, and >modify the same questions in RoCE module > 2. fix the spelling error > 3. fix the Signed-off-by signatures > > Changes v4 -> v5: > 1. redesign the patchset for RoCE modules in order to split the huge >patch into small patches > 2. 
fix the directory path for RoCE module. Delete the hisilicon level. > 3. modify the name of roce_v1_hw into roce_hw_v1 > > Changes v3 -> v4: > 1. modify roce.o into hns-roce.o in Makefile and Kconfig file > > Changes v2 -> v3: > 1. modify the formats of RoCE driver code base v2 by the experts >reviewing. also, it used kmalloc_array instead of kmalloc, kcalloc >instead of kzalloc, when refer to memory allocation for array > 2. remove some functions without use and unconnected macros > 3. modify the binding document with RoCE DT base v2 which added >interrupt-names > 4. redesign the port_map and si_map in hns_dsaf_roce_reset > 5. add HiSilicon RoCE driver maintainers introduction in MAINTAINERS >document > > Changes v1 -> v2: > 1. modify the formats of roce driver code by the experts reviewing > 2. modify the bindings file with roce dts. add the attribute named >interrput-names. > 3. modify the way of defining port mode in hns_dsaf_main.c >
[PATCH 2/2] net: nps_enet: code reuse
From: Elad Kanfi Add inline function that checks if there is a pending tx packet. Signed-off-by: Elad Kanfi --- drivers/net/ethernet/ezchip/nps_enet.c | 21 +++-- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index b182e2a..25faa3d 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -24,6 +24,14 @@ #define DRV_NAME "nps_mgt_enet" +static inline bool nps_enet_is_tx_pending(struct nps_enet_priv *priv) +{ + u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL); + u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT; + + return (!tx_ctrl_ct && priv->tx_skb); +} + static void nps_enet_clean_rx_fifo(struct net_device *ndev, u32 frame_len) { struct nps_enet_priv *priv = netdev_priv(ndev); @@ -141,12 +149,11 @@ static void nps_enet_tx_handler(struct net_device *ndev) { struct nps_enet_priv *priv = netdev_priv(ndev); u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL); - u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT; u32 tx_ctrl_et = (tx_ctrl_value & TX_CTL_ET_MASK) >> TX_CTL_ET_SHIFT; u32 tx_ctrl_nt = (tx_ctrl_value & TX_CTL_NT_MASK) >> TX_CTL_NT_SHIFT; /* Check if we got TX */ - if (!priv->tx_skb || tx_ctrl_ct) + if (!nps_enet_is_tx_pending(priv)) return; /* Ack Tx ctrl register */ @@ -184,9 +191,6 @@ static int nps_enet_poll(struct napi_struct *napi, int budget) work_done = nps_enet_rx_handler(ndev); if (work_done < budget) { u32 buf_int_enable_value = 0; - u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL); - u32 tx_ctrl_ct = - (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT; napi_complete(napi); @@ -205,8 +209,7 @@ static int nps_enet_poll(struct napi_struct *napi, int budget) * the two code lines below will solve this situation by * re-adding ourselves to the poll list. 
*/ - - if (priv->tx_skb && !tx_ctrl_ct) { + if (nps_enet_is_tx_pending(priv)) { nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0); napi_reschedule(napi); } @@ -231,11 +234,9 @@ static irqreturn_t nps_enet_irq_handler(s32 irq, void *dev_instance) struct net_device *ndev = dev_instance; struct nps_enet_priv *priv = netdev_priv(ndev); u32 rx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_RX_CTL); - u32 tx_ctrl_value = nps_enet_reg_get(priv, NPS_ENET_REG_TX_CTL); - u32 tx_ctrl_ct = (tx_ctrl_value & TX_CTL_CT_MASK) >> TX_CTL_CT_SHIFT; u32 rx_ctrl_cr = (rx_ctrl_value & RX_CTL_CR_MASK) >> RX_CTL_CR_SHIFT; - if ((!tx_ctrl_ct && priv->tx_skb) || rx_ctrl_cr) + if (nps_enet_is_tx_pending(priv) || rx_ctrl_cr) if (likely(napi_schedule_prep(&priv->napi))) { nps_enet_reg_set(priv, NPS_ENET_REG_BUF_INT_ENABLE, 0); __napi_schedule(&priv->napi); -- 1.7.1
Re: [PATCH net-next 0/6] sctp: allow GSO frags to access the chunk too
From: Marcelo Ricardo Leitner Date: Wed, 13 Jul 2016 15:08:54 -0300 > Patchset is named after the most important fix in it. First two patches > are preparing the grounds for the 3rd patch. > > After the 3rd, they are not strictly logically related to the patchset, > but I kept them together as they depend on each other. > > More details on patch changelogs. Series applied, thanks.
Re: [PATCH -next] net: ethernet: bgmac: Remove redundant dev_err call in bgmac_probe()
From: weiyj...@163.com Date: Wed, 13 Jul 2016 12:46:57 + > From: Wei Yongjun > > There is a error message within devm_ioremap_resource > already, so remove the dev_err call to avoid redundant > error message. > > Signed-off-by: Wei Yongjun Applied.
Re: [PATCH -next] stmmac: dwmac-socfpga: remove redundant dev_err call in socfpga_dwmac_parse_data()
From: weiyj...@163.com Date: Wed, 13 Jul 2016 12:46:40 + > From: Wei Yongjun > > There is a error message within devm_ioremap_resource > already, so remove the dev_err call to avoid redundant > error message. > > Signed-off-by: Wei Yongjun Applied.
Re: [PATCH net-next] net: vrf: Address comments from last documentation update
From: David Ahern Date: Wed, 13 Jul 2016 18:28:16 -0600 > Comments from Frank Kellerman on last doc update: > - extra whitespace in front of a neigh show command > - convert the brief link example to 'vrf red' > > Signed-off-by: David Ahern Applied.
[PATCH net-next] net: vrf: Address comments from last documentation update
Comments from Frank Kellerman on last doc update: - extra whitespace in front of a neigh show command - convert the brief link example to 'vrf red' Signed-off-by: David Ahern --- Documentation/networking/vrf.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/vrf.txt b/Documentation/networking/vrf.txt index 11a2b99bdbb9..755dab856392 100644 --- a/Documentation/networking/vrf.txt +++ b/Documentation/networking/vrf.txt @@ -189,7 +189,7 @@ older form without it. Or using the brief output: - $ ip -br link show master red + $ ip -br link show vrf red eth1 UP 02:00:00:00:02:02 eth2 UP 02:00:00:00:02:03 eth5 DOWN 02:00:00:00:02:06 @@ -207,8 +207,8 @@ older form without it. 10.2.1.254 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE 10.2.2.254 dev eth2 lladdr 5e:54:01:6a:ee:80 REACHABLE -$ ip -6 neigh show vrf red -2002:1::64 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE + $ ip -6 neigh show vrf red + 2002:1::64 dev eth1 lladdr a6:d9:c7:4f:06:23 REACHABLE 6. Show Addresses for a VRF -- 2.7.4 (Apple Git-66)
[PATCH 1/2] net: ethernet: ll_temac: use phydev from struct net_device
The private structure contain a pointer to phydev, but the structure net_device already contain such pointer. So we can remove the pointer phy in the private structure, and update the driver to use the one contained in struct net_device. Signed-off-by: Philippe Reynes --- drivers/net/ethernet/xilinx/ll_temac.h |1 - drivers/net/ethernet/xilinx/ll_temac_main.c | 37 +++--- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac.h b/drivers/net/ethernet/xilinx/ll_temac.h index 902457e..7d06e3e 100644 --- a/drivers/net/ethernet/xilinx/ll_temac.h +++ b/drivers/net/ethernet/xilinx/ll_temac.h @@ -332,7 +332,6 @@ struct temac_local { struct device *dev; /* Connection to PHY device */ - struct phy_device *phy_dev; /* Pointer to PHY device */ struct device_node *phy_node; /* MDIO bus data */ diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 7397087..8d6a178 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -590,7 +590,7 @@ static void temac_device_reset(struct net_device *ndev) static void temac_adjust_link(struct net_device *ndev) { struct temac_local *lp = netdev_priv(ndev); - struct phy_device *phy = lp->phy_dev; + struct phy_device *phy = ndev->phydev; u32 mii_speed; int link_state; @@ -843,19 +843,20 @@ static irqreturn_t ll_temac_rx_irq(int irq, void *_ndev) static int temac_open(struct net_device *ndev) { struct temac_local *lp = netdev_priv(ndev); + struct phy_device *phydev = NULL; int rc; dev_dbg(&ndev->dev, "temac_open()\n"); if (lp->phy_node) { - lp->phy_dev = of_phy_connect(lp->ndev, lp->phy_node, -temac_adjust_link, 0, 0); - if (!lp->phy_dev) { + phydev = of_phy_connect(lp->ndev, lp->phy_node, + temac_adjust_link, 0, 0); + if (!phydev) { dev_err(lp->dev, "of_phy_connect() failed\n"); return -ENODEV; } - phy_start(lp->phy_dev); + phy_start(phydev); } temac_device_reset(ndev); @@ -872,9 +873,8 @@ static 
int temac_open(struct net_device *ndev) err_rx_irq: free_irq(lp->tx_irq, ndev); err_tx_irq: - if (lp->phy_dev) - phy_disconnect(lp->phy_dev); - lp->phy_dev = NULL; + if (phydev) + phy_disconnect(phydev); dev_err(lp->dev, "request_irq() failed\n"); return rc; } @@ -882,15 +882,15 @@ static int temac_open(struct net_device *ndev) static int temac_stop(struct net_device *ndev) { struct temac_local *lp = netdev_priv(ndev); + struct phy_device *phydev = ndev->phydev; dev_dbg(&ndev->dev, "temac_close()\n"); free_irq(lp->tx_irq, ndev); free_irq(lp->rx_irq, ndev); - if (lp->phy_dev) - phy_disconnect(lp->phy_dev); - lp->phy_dev = NULL; + if (phydev) + phy_disconnect(phydev); temac_dma_bd_release(ndev); @@ -916,15 +916,13 @@ temac_poll_controller(struct net_device *ndev) static int temac_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) { - struct temac_local *lp = netdev_priv(ndev); - if (!netif_running(ndev)) return -EINVAL; - if (!lp->phy_dev) + if (!ndev->phydev) return -EINVAL; - return phy_mii_ioctl(lp->phy_dev, rq, cmd); + return phy_mii_ioctl(ndev->phydev, rq, cmd); } static const struct net_device_ops temac_netdev_ops = { @@ -971,20 +969,17 @@ static const struct attribute_group temac_attr_group = { /* ethtool support */ static int temac_get_settings(struct net_device *ndev, struct ethtool_cmd *cmd) { - struct temac_local *lp = netdev_priv(ndev); - return phy_ethtool_gset(lp->phy_dev, cmd); + return phy_ethtool_gset(ndev->phydev, cmd); } static int temac_set_settings(struct net_device *ndev, struct ethtool_cmd *cmd) { - struct temac_local *lp = netdev_priv(ndev); - return phy_ethtool_sset(lp->phy_dev, cmd); + return phy_ethtool_sset(ndev->phydev, cmd); } static int temac_nway_reset(struct net_device *ndev) { - struct temac_local *lp = netdev_priv(ndev); - return phy_start_aneg(lp->phy_dev); + return phy_start_aneg(ndev->phydev); } static const struct ethtool_ops temac_ethtool_ops = { -- 1.7.4.4
[PATCH 2/2] net: ethernet: ll_temac: use phy_ethtool_{get|set}_link_ksettings
There are two generics functions phy_ethtool_{get|set}_link_ksettings, so we can use them instead of defining the same code in the driver. Signed-off-by: Philippe Reynes --- drivers/net/ethernet/xilinx/ll_temac_main.c | 14 ++ 1 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/xilinx/ll_temac_main.c b/drivers/net/ethernet/xilinx/ll_temac_main.c index 8d6a178..a9bd665 100644 --- a/drivers/net/ethernet/xilinx/ll_temac_main.c +++ b/drivers/net/ethernet/xilinx/ll_temac_main.c @@ -967,27 +967,17 @@ static const struct attribute_group temac_attr_group = { }; /* ethtool support */ -static int temac_get_settings(struct net_device *ndev, struct ethtool_cmd *cmd) -{ - return phy_ethtool_gset(ndev->phydev, cmd); -} - -static int temac_set_settings(struct net_device *ndev, struct ethtool_cmd *cmd) -{ - return phy_ethtool_sset(ndev->phydev, cmd); -} - static int temac_nway_reset(struct net_device *ndev) { return phy_start_aneg(ndev->phydev); } static const struct ethtool_ops temac_ethtool_ops = { - .get_settings = temac_get_settings, - .set_settings = temac_set_settings, .nway_reset = temac_nway_reset, .get_link = ethtool_op_get_link, .get_ts_info = ethtool_op_get_ts_info, + .get_link_ksettings = phy_ethtool_get_link_ksettings, + .set_link_ksettings = phy_ethtool_set_link_ksettings, }; static int temac_of_probe(struct platform_device *op) -- 1.7.4.4
Re: pull request: bluetooth-next 2016-07-13
From: Johan Hedberg Date: Wed, 13 Jul 2016 11:25:40 +0300 > Here's our main bluetooth-next pull request for the 4.8 kernel: > > - Fixes and cleanups in 802.15.4 and 6LoWPAN code > - Fix out of bounds issue in btmrvl driver > - Fixes to Bluetooth socket recvmsg return values > - Use crypto_cipher_encrypt_one() instead of crypto_skcipher > - Cleanup of Bluetooth connection sysfs interface > - New Authentication failure reson code for Disconnected mgmt event > - New USB IDs for Atheros, Qualcomm and Intel Bluetooth controllers > > Please let me know if there are any issues pulling. Thanks. Pulled, thanks Johan.
Re: [net-next PATCH 2/3] pktgen: add sample script pktgen_sample05_flow_per_thread.sh
On Wed, Jul 13, 2016 at 10:06:10PM +0200, Jesper Dangaard Brouer wrote: > This pktgen sample script is useful for scalability testing a > receiver. The script will simply generate one flow per > thread (option -t N) using the thread number as part of the > source IP-address. > > The single flow sample (pktgen_sample03_burst_single_flow.sh) > have become quite popular, but it is important that developers > also make sure to benchmark scalability of multiple receive > queues. > > Signed-off-by: Jesper Dangaard Brouer > --- > samples/pktgen/pktgen_sample05_flow_per_thread.sh | 81 > + > 1 file changed, 81 insertions(+) ... > +# Setup source IP-addresses based on thread number > +pg_set $dev "src_min 198.18.$((thread+1)).1" > +pg_set $dev "src_max 198.18.$((thread+1)).1" I have similar script that uses udp_src_min/max to change port number, since port is easier to match on the target host and we don't use ipv4 ;) but this script is also good improvement. Thanks! Acked-by: Alexei Starovoitov
Re: [PATCH] rndis_host: Set random MAC for ZTE MF910
Kristian Evensen writes: > From: Kristian Evensen > > All ZTE MF910 mifis, at least on some revisions, export the same MAC > address (36:4b:50:b7:ef:da). Check for this MAC address and set a random > MAC if detected. > > Also, changed the memcpy() to ether_addr_copy(), as pointed out by > checkpatch. > > Signed-off-by: Kristian Evensen > --- > drivers/net/usb/rndis_host.c | 9 - > 1 file changed, 8 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c > index 524a47a281..85bdbdf 100644 > --- a/drivers/net/usb/rndis_host.c > +++ b/drivers/net/usb/rndis_host.c > @@ -295,6 +295,9 @@ static const struct net_device_ops rndis_netdev_ops = { > .ndo_validate_addr = eth_validate_addr, > }; > > +/* well-known buggy ZTE MF910 MAC address */ > +static const u8 buggy_zte_addr[ETH_ALEN] = {0x36, 0x4b, 0x50, 0xb7, 0xef, > 0xda}; > + > int > generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) > { > @@ -428,7 +431,11 @@ generic_rndis_bind(struct usbnet *dev, struct > usb_interface *intf, int flags) > dev_err(&intf->dev, "rndis get ethaddr, %d\n", retval); > goto halt_fail_and_release; > } > - memcpy(net->dev_addr, bp, ETH_ALEN); > + > + if (ether_addr_equal(bp, buggy_zte_addr)) > + eth_hw_addr_random(net); > + else > + ether_addr_copy(net->dev_addr, bp); > > /* set a nonzero filter to enable data transfers */ > memset(u.set, 0, sizeof *u.set); Or how about the more generic?: if (bp[0] & 0x02) eth_hw_addr_random(net); else ether_addr_copy(net->dev_addr, bp); That would catch similar screwups from other vendors too. Bjørn
Re: [PATCH RFC 3/3] ARM64: dts: meson-gxbb: use the new meson8b DWMAC glue
Michael Turquette writes: > Hi Martin, > > Quoting Martin Blumenstingl (2016-06-27 04:33:49) >> On Mon, Jun 27, 2016 at 12:44 PM, Martin Blumenstingl >> wrote: >> > On Mon, Jun 27, 2016 at 11:24 AM, Carlo Caione wrote: >> >> A syscon is a region containing a set of miscellaneous registers used >> >> for several reasons by several devices [1]. It this case there is really >> >> no need to define a new syscon node since those two registers are only >> >> used by your driver. >> > I can easily change it back if that's the way to go. >> > Before I do that: could you please confirm that "mp2_clk_out" (which >> > is controlled by PRG_ETH0/offset 0x0 bits 7-9) is not something which >> > has to be available through the common clk framework? >> there was just an IRC discussion with Carlo on this topic: >> We tried to find whether PRG_ETH0 is used to actually configure >> "mp2_clk_out". Carlo brought up that it could also be the case that >> the ethernet block simply needs to be informed about the rate of the >> mp2_clk_out (which is *probably* the "mpll2" clock). >> >> I'm adding Michael Turquette to this mail, maybe you can comment on this >> topic. >> >> If it turns out that the etthernet block just has to know about the >> clock rate then we have two tasks: >> 1. identify why the mpll2 rate returns 0 on my GXBB device > > This is in progress, but turns out it doesn't matter for Ethernet. Bit 4 > in PRG_ETHERNET_ADDR0 control a mux clock inside of the Ethernet > controller. > > A value of 0x0 selects fclk_div2 and a value of 0x1 selects mp2_clk_out. > The bootloader programs in sets the mux to zero, or fclk_div2 as the > input clock (which runs at 1GHz). > >> 2. change my patch so the new DWMAC glue gets a reference to the mpll2 >> clock and then use "clk_get_rate(mpll2) / (250 * 100)" to >> configure the PRG_ETH0_MP2_CLK bits. > > Hmm, I'm not sure about that part. Bits 7-9 is a divider that further > divides the clock signal selected by bit 4. 
This is set to 0x4, which > means we divide the 1GHz fclk_div2 down to 250MHz, which seems to be the > expected value coming out of this divider. > > I haven't looked further to see if there is a further programmable > divider to divide 250MHz down to 50MHz, or (more likely) there is simply > a fixed-factor divide-by-5 that results in the 50MHz rate consumed by > the PHY. > > Modeling this all in the mmc driver makes sense. So we would have: > > struct clk_mux clk_m250_sel -> > struct clk_divider clk_m250_div -> > struct clk_fixed_factor enet_phy_clk There's also bit 10: "Generate 25MHz clock for PHY" (which is set to 1 by the bootloaders on P200 and odroidc2) This suggests that it might not be a fixed-factor divide-by-5 but a choice between a divide-by-5 and a divide-by-10 for the PHY clock. Kevin
[PATCH net-next 1/2] devlink: fix build error for CONFIG_MODULES=n
A driver calling trace_devlink_hwmsg cannot be built when modules are disabled: include/trace/events/devlink.h: In function 'trace_event_get_offsets_devlink_hwmsg': include/trace/events/devlink.h:25:51: error: dereferencing pointer to incomplete type 'struct module' __string(owner_name, devlink->dev->driver->owner->name) This changes the code to only print the module name when modules are actually enabled, otherwise we hardcode the string "built-in". Signed-off-by: Arnd Bergmann Fixes: e5224f0fe2ac ("devlink: add hardware messages tracing facility") --- include/trace/events/devlink.h | 8 1 file changed, 8 insertions(+) diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 333c32ac9bfa..26f92d3c7e9c 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -22,7 +22,11 @@ TRACE_EVENT(devlink_hwmsg, TP_STRUCT__entry( __string(bus_name, devlink->dev->bus->name) __string(dev_name, dev_name(devlink->dev)) +#ifdef CONFIG_MODULES __string(owner_name, devlink->dev->driver->owner->name) +#else + __string(owner_name, "built-in") +#endif __field(bool, incoming) __field(unsigned long, type) __dynamic_array(u8, buf, len) @@ -32,7 +36,11 @@ TRACE_EVENT(devlink_hwmsg, TP_fast_assign( __assign_str(bus_name, devlink->dev->bus->name); __assign_str(dev_name, dev_name(devlink->dev)); +#ifdef CONFIG_MODULES __assign_str(owner_name, devlink->dev->driver->owner->name); +#else + __assign_str(owner_name, "built-in"); +#endif __entry->incoming = incoming; __entry->type = type; memcpy(__get_dynamic_array(buf), buf, len); -- 2.9.0
[PATCH net-next 2/2] devlink: fix trace format string
Including devlink.h on ARM and probably other 32-bit architectures results in a harmless warning: In file included from ../include/trace/define_trace.h:95:0, from ../include/trace/events/devlink.h:51, from ../net/core/devlink.c:30: include/trace/events/devlink.h: In function 'trace_raw_output_devlink_hwmsg': include/trace/events/devlink.h:42:12: error: format '%lu' expects argument of type 'long unsigned int', but argument 10 has type 'size_t {aka unsigned int}' [-Werror=format=] The correct format string for 'size_t' is %zu, not %lu, this works on all architectures. Signed-off-by: Arnd Bergmann Fixes: e5224f0fe2ac ("devlink: add hardware messages tracing facility") --- include/trace/events/devlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 26f92d3c7e9c..4b75a6f986fc 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -47,7 +47,7 @@ TRACE_EVENT(devlink_hwmsg, __entry->len = len; ), - TP_printk("bus_name=%s dev_name=%s owner_name=%s incoming=%d type=%lu buf=0x[%*phD] len=%lu", + TP_printk("bus_name=%s dev_name=%s owner_name=%s incoming=%d type=%lu buf=0x[%*phD] len=%zu", __get_str(bus_name), __get_str(dev_name), __get_str(owner_name), __entry->incoming, __entry->type, (int) __entry->len, __get_dynamic_array(buf), __entry->len) -- 2.9.0
Re: [PATCH RFC 3/3] ARM64: dts: meson-gxbb: use the new meson8b DWMAC glue
Hi Martin, Quoting Martin Blumenstingl (2016-06-27 04:33:49) > On Mon, Jun 27, 2016 at 12:44 PM, Martin Blumenstingl > wrote: > > On Mon, Jun 27, 2016 at 11:24 AM, Carlo Caione wrote: > >> A syscon is a region containing a set of miscellaneous registers used > >> for several reasons by several devices [1]. It this case there is really > >> no need to define a new syscon node since those two registers are only > >> used by your driver. > > I can easily change it back if that's the way to go. > > Before I do that: could you please confirm that "mp2_clk_out" (which > > is controlled by PRG_ETH0/offset 0x0 bits 7-9) is not something which > > has to be available through the common clk framework? > there was just an IRC discussion with Carlo on this topic: > We tried to find whether PRG_ETH0 is used to actually configure > "mp2_clk_out". Carlo brought up that it could also be the case that > the ethernet block simply needs to be informed about the rate of the > mp2_clk_out (which is *probably* the "mpll2" clock). > > I'm adding Michael Turquette to this mail, maybe you can comment on this > topic. > > If it turns out that the etthernet block just has to know about the > clock rate then we have two tasks: > 1. identify why the mpll2 rate returns 0 on my GXBB device This is in progress, but turns out it doesn't matter for Ethernet. Bit 4 in PRG_ETHERNET_ADDR0 control a mux clock inside of the Ethernet controller. A value of 0x0 selects fclk_div2 and a value of 0x1 selects mp2_clk_out. The bootloader programs in sets the mux to zero, or fclk_div2 as the input clock (which runs at 1GHz). > 2. change my patch so the new DWMAC glue gets a reference to the mpll2 > clock and then use "clk_get_rate(mpll2) / (250 * 100)" to > configure the PRG_ETH0_MP2_CLK bits. Hmm, I'm not sure about that part. Bits 7-9 is a divider that further divides the clock signal selected by bit 4. 
This is set to 0x4, which means we divide the 1GHz fclk_div2 down to 250MHz, which seems to be the expected value coming out of this divider. I haven't looked further to see if there is a further programmable divider to divide 250MHz down to 50MHz, or (more likely) there is simply a fixed-factor divide-by-5 that results in the 50MHz rate consumed by the PHY. Modeling this all in the mmc driver makes sense. So we would have: struct clk_mux clk_m250_sel -> struct clk_divider clk_m250_div -> struct clk_fixed_factor enet_phy_clk I don't know what the name should be for that last one, I just chose enet_phy_clk since it illustrates the point. The updated docs suggest that clk_m250_{sel,div} might be reasonable names for the mux and divider. Kevin and I just got this info from AmLogic earlier today. The next rev of documentation should correct these register definitions. Regards, Mike
Re: 4.6.3 panic on nf_ct_delete (nf_conntrack)
On 2016-07-13 23:21, Florian Westphal wrote: nuclear...@nuclearcat.com wrote: Workload: pppoe server, 5k users on ppp interfaces. No actual SNAT/DNAT, but using connmark and REDIRECT [176412.990104] general protection fault: [#1] SMP I assume that you did not see this before. What was the last kernel version where you did not run into this? Might help to narrow things down. Difficult to say, because it was triggered also on 4.5.3 at 10 Jun, while i was running this kernel since May 10, and never had such issue before. Maybe some new traffic pattern caused this, or because interfaces saturated now, and might reach full bandwidth (800Mbps in bursts might reach 1G, and traffic will be dropped?). Here is panic from 4.5.3: [85867.255619] general protection fault: [#1] SMP [85867.255939] Modules linked in: cls_fw act_police cls_u32 sch_ingress sch_sfq sch_htb netconsole configfs coretemp nf_nat_pptp nf_nat_proto_gre nf_conntrack_pptp nf_conntrack_proto_gre pppoe pppox ppp_generic slhc tun xt_REDIRECT nf_nat_redirect xt_TCPMSS ipt_REJECT nf_reject_ipv4 xt_set ts_bm xt_string xt_connmark xt_DSCP xt_mark xt_tcpudp ip_set_hash_net ip_set_hash_ip ip_set nfnetlink iptable_mangle iptable_filter iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack ip_tables x_tables 8021q garp mrp stp llc [85867.263194] CPU: 7 PID: 0 Comm: swapper/7 Not tainted 4.5.3-build-0100 #4 [85867.263397] Hardware name: HP ProLiant DL320e Gen8 v2, BIOS P80 04/02/2015 [85867.263598] task: 880435584680 ti: 8804355a8000 task.ti: 8804355a8000 [85867.263936] RIP: 0010:[] [] nf_ct_delete+0x1a/0x1dc [nf_conntrack] [85867.264343] RSP: 0018:8804474e3e80 EFLAGS: 00010282 [85867.264545] RAX: 8804021b3738 RBX: 8100 RCX: dead0200 [85867.264749] RDX: RSI: RDI: ffa00504021b36b0 [85867.264950] RBP: 8804474e3ec8 R08: 8804474e3f08 R09: [85867.265151] R10: 820090c0 R11: 0002 R12: ffa00504021b36b0 [85867.265351] R13: R14: R15: 820090c8 [85867.265553] FS: () GS:8804474e() knlGS: [85867.265892] CS: 0010 
DS: ES: CR0: 80050033 [85867.266092] CR2: 7fb170542dc8 CR3: 0200a000 CR4: 001406e0 [85867.266295] Stack: [85867.266490] 8804474e3ec0 810f996a 880435584680 8804474edc40 [85867.267057] 8100 a003d2b1 00fa 8804355ac000 [85867.267624] 820090c8 8804474e3ed8 a003d2be 8804474e3ef8 [85867.268192] Call Trace: [85867.268392] [85867.268456] [] ? hrtimer_forward+0xd5/0xeb [85867.268857] [] ? nf_ct_delete+0x1dc/0x1dc [nf_conntrack] [85867.269062] [] death_by_timeout+0xd/0xf [nf_conntrack] [85867.269265] [] call_timer_fn.isra.26+0x17/0x6d [85867.269468] [] run_timer_softirq+0x176/0x197 [85867.269672] [] __do_softirq+0xb9/0x1a9 [85867.269873] [] irq_exit+0x37/0x7c [85867.270077] [] smp_apic_timer_interrupt+0x3d/0x48 [85867.270282] [] apic_timer_interrupt+0x7c/0x90 [85867.270484] [85867.270546] [] ? mwait_idle+0x64/0x7a [85867.270943] [] arch_cpu_idle+0xa/0xc [85867.271144] [] default_idle_call+0x27/0x29 [85867.271345] [] cpu_startup_entry+0x11f/0x1c9 [85867.271548] [] start_secondary+0xf1/0xf4 [85867.271750] Code: e8 35 60 08 e1 58 5b 41 5c 41 5d 41 5e 41 5f 5d c3 55 48 89 e5 41 57 41 56 41 55 41 54 41 89 f5 53 49 89 fc 41 89 d6 48 83 ec 20 8b 9f c8 00 00 00 48 85 db 74 20 0f b7 43 1c 66 85 c0 74 17 [85867.275937] RIP [] nf_ct_delete+0x1a/0x1dc [nf_conntrack] [85867.276200] RSP [85867.276423] ---[ end trace 7be551057bff38cd ]--- [85867.285767] Kernel panic - not syncing: Fatal exception in interrupt [85867.285973] Kernel Offset: disabled [85867.319076] Rebooting in 5 seconds..
Re: [PATCH 1/1] tracing, bpf: Implement function bpf_probe_write
On Wed, 13 Jul 2016, Alexei Starovoitov wrote: > On Wed, Jul 13, 2016 at 03:36:11AM -0700, Sargun Dhillon wrote: >> Provides BPF programs, attached to kprobes a safe way to write to >> memory referenced by probes. This is done by making probe_kernel_write >> accessible to bpf functions via the bpf_probe_write helper. > > not quite :) > >> Signed-off-by: Sargun Dhillon >> --- >> include/uapi/linux/bpf.h | 3 +++ >> kernel/trace/bpf_trace.c | 20 >> samples/bpf/bpf_helpers.h | 2 ++ >> 3 files changed, 25 insertions(+) >> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 406459b..355b565 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -313,6 +313,9 @@ enum bpf_func_id { >> */ >> BPF_FUNC_skb_get_tunnel_opt, >> BPF_FUNC_skb_set_tunnel_opt, >> + >> + BPF_FUNC_probe_write, /* int bpf_probe_write(void *dst, void *src, >> int size) */ >> + > > the patch is against some old kernel. > Please always make the patch against net-next tree and cc netdev list. > Sorry, I did this against Linus's tree, not net-next. Will fix. >> +static u64 bpf_probe_write(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) >> +{ >> + void *dst = (void *) (long) r1; >> + void *unsafe_ptr = (void *) (long) r2; >> + int size = (int) r3; >> + >> + return probe_kernel_write(dst, unsafe_ptr, size); >> +} > > the patch is whitepsace mangled. Please see > Documentation/networking/netdev-FAQ.txt Also will fix. > > the main issue though that we cannot simply allow bpf to do probe_write, > since it may crash the kernel. > What might be ok is to allow writing into memory of current > user space process only. This way bpf prog will keep kernel safety guarantees, > yet it will be able to modify user process memory when necessary. > Since bpf+tracing is root only, it doesn't pose security risk. > > Doesn't probe_write prevent you from writing to protected memory and generate an EFAULT? 
Or are you worried about the situation where a bpf program writes to some other chunk of kernel memory, or writes bad data to said kernel memory? I guess when I meant "safe" -- it's safer than allowing arbitrary memcpy. I don't see a good way to ensure safety otherwise as we don't know which registers point to memory that it's reasonable for probes to manipulate. It's not like skb_store_bytes where we can check the pointer going in is the same pointer that's referenced, and with a super restricted datatype. Perhaps, it would be a good idea to describe an example where I used this: #include #include #include int trace_inet_stream_connect(struct pt_regs *ctx) { if (!PT_REGS_PARM2(ctx)) { return 0; } struct sockaddr uaddr = {}; struct sockaddr_in *addr_in; bpf_probe_read(&uaddr, sizeof(struct sockaddr), (void *)PT_REGS_PARM2(ctx)); if (uaddr.sa_family == AF_INET) { // Simple cast causes LLVM weirdness addr_in = &uaddr; char fmt[] = "Connecting on port: %d\n"; bpf_trace_printk(fmt, sizeof(fmt), ntohs(addr_in->sin_port)); if (ntohs(addr_in->sin_port) == 80) { addr_in->sin_port = htons(443); bpf_probe_write((void *)PT_REGS_PARM2(ctx), &uaddr, sizeof(uaddr)); } } return 0; }; There are two reasons I want to do this: 1) Debugging - sometimes, it makes sense to divert a program's syscalls in order to allow for better debugging 2) Network Functions - I wrote a load balancer which intercepts inet_stream_connect & tcp_set_state. We can manipulate the destination address as neccessary at connect time. 
This also has the nice side effect that getpeername() returns the real IP that a server is connected to, and the performance is far better than doing "network load balancing" (I realize this is a total hack, better approaches would be appreciated) If we allowed manipulation of the current task's user memory by exposing copy_to_user, that could also work if I attach the probe to sys_connect, I could overwrite the address there before it gets copied into kernel space, but that could lead to its own weirdness. Any ideas?
Re: 4.6.3 panic on nf_ct_delete (nf_conntrack)
nuclear...@nuclearcat.com wrote: > Workload: pppoe server, 5k users on ppp interfaces. No actual SNAT/DNAT, but > using connmark and REDIRECT > > [176412.990104] general protection fault: [#1] > SMP I assume that you did not see this before. What was the last kernel version where you did not run into this? Might help to narrow things down.
4.6.3 panic on nf_ct_delete (nf_conntrack)
Workload: pppoe server, 5k users on ppp interfaces. No actual SNAT/DNAT, but using connmark and REDIRECT [176412.990104] general protection fault: [#1] SMP [176412.990424] Modules linked in: sch_pie cls_fw act_police cls_u32 sch_ingress sch_sfq sch_htb netconsole [176412.991427] configfs coretemp nf_nat_pptp nf_nat_proto_gre nf_conntrack_pptp nf_conntrack_proto_gre pppoe pppox [176412.992571] ppp_generic slhc [176412.993218] tun xt_REDIRECT nf_nat_redirect xt_TCPMSS ipt_REJECT nf_reject_ipv4 xt_set ts_bm xt_string xt_connmark xt_DSCP xt_mark xt_tcpudp ip_set_hash_net ip_set_hash_ip ip_set nfnetlink iptable_mangle iptable_filter iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack ip_tables x_tables 8021q [176412.996208] garp mrp stp llc [176412.996834] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.6.3-build-0105 #4 [176412.997037] Hardware name: HP ProLiant DL320e Gen8 v2, BIOS P80 04/02/2015 [176412.997241] task: 88043558af00 ti: 8804355a task.ti: 8804355a [176412.997580] RIP: 0010:[] [] nf_ct_delete+0x26/0x1dc [nf_conntrack] [176412.997985] RSP: 0018:8804474a3e80 EFLAGS: 00010282 [176412.998187] RAX: 880428bc0c90 RBX: ffac050402505080 RCX: dead0200 [176412.998524] RDX: RSI: RDI: 880428bc0c08 [176412.998865] RBP: 8804474a3ec8 R08: 8804474a3f08 R09: [176412.999204] R10: 820050c0 R11: 049a R12: 880428bc0c08 [176412.999545] R13: R14: R15: 820050c8 [176412.999885] FS: () GS:8804474a() knlGS: [176413.000226] CS: 0010 DS: ES: CR0: 80050033 [176413.000427] CR2: 7f1dc4960100 CR3: 02006000 CR4: 001406e0 [176413.000767] Stack: [176413.000963] 8804474a3ec0 810fb036 88043558af07 8804474adcc0 [176413.001534] 8100 a003d2ad 00a1 8804355a4000 [176413.002097] 820050c8 8804474a3ed8 a003d2ba 8804474a3ef8 [176413.002666] Call Trace: [176413.002862] [176413.002926] [] ? hrtimer_forward+0xd5/0xeb [176413.003322] [] ? 
nf_ct_delete+0x1dc/0x1dc [nf_conntrack] [176413.003525] [] death_by_timeout+0xd/0xf [nf_conntrack] [176413.003727] [] call_timer_fn.isra.26+0x17/0x6d [176413.003931] [] run_timer_softirq+0x176/0x197 [176413.004134] [] __do_softirq+0xb9/0x1a9 [176413.004333] [] irq_exit+0x37/0x7c [176413.004533] [] smp_apic_timer_interrupt+0x3d/0x48 [176413.004734] [] apic_timer_interrupt+0x7c/0x90 [176413.004935] [176413.004997] [] ? mwait_idle+0x68/0x7e [176413.005391] [] arch_cpu_idle+0xa/0xc [176413.005592] [] default_idle_call+0x27/0x29 [176413.005791] [] cpu_startup_entry+0x115/0x1bf [176413.005993] [] start_secondary+0xf1/0xf4 [176413.006193] Code: 5e 41 5f 5d c3 55 48 89 e5 41 57 41 56 41 55 41 54 41 89 f5 53 49 89 fc 41 89 d6 48 83 ec 20 48 8b 9f c8 00 00 00 48 85 db 74 20 b7 43 1c 66 85 c0 74 17 48 01 c3 74 12 48 83 7b 08 00 75 0b [176413.010382] RIP [] nf_ct_delete+0x26/0x1dc [nf_conntrack] [176413.010643] RSP [176413.010855] ---[ end trace cf1060fc5087293e ]--- [176413.018573] Kernel panic - not syncing: Fatal exception in interrupt [176413.018781] Kernel Offset: disabled [176413.046284] ERST: [Firmware Warn]: Firmware does not respond in time. [176413.050041] Rebooting in 5 seconds..
[net-next PATCH 0/3] pktgen samples: new scripts and removing older samples
This patchset is adding some pktgen sample scripts that I've been using for a while[1], and they seem to be relevant for more people. The patchset also removes some of the older style pktgen samples. [1] https://github.com/netoptimizer/network-testing/tree/master/pktgen --- Jesper Dangaard Brouer (3): pktgen: add sample script pktgen_sample04_many_flows.sh pktgen: add sample script pktgen_sample05_flow_per_thread.sh pktgen: remove sample script pktgen.conf-1-1-rdos samples/pktgen/pktgen.conf-1-1-flows | 67 --- samples/pktgen/pktgen.conf-1-1-rdos | 64 -- samples/pktgen/pktgen_sample04_many_flows.sh | 93 + samples/pktgen/pktgen_sample05_flow_per_thread.sh | 81 ++ 4 files changed, 174 insertions(+), 131 deletions(-) delete mode 100755 samples/pktgen/pktgen.conf-1-1-flows delete mode 100755 samples/pktgen/pktgen.conf-1-1-rdos create mode 100755 samples/pktgen/pktgen_sample04_many_flows.sh create mode 100755 samples/pktgen/pktgen_sample05_flow_per_thread.sh --
[net-next PATCH 2/3] pktgen: add sample script pktgen_sample05_flow_per_thread.sh
This pktgen sample script is useful for scalability testing a receiver. The script will simply generate one flow per thread (option -t N) using the thread number as part of the source IP-address. The single flow sample (pktgen_sample03_burst_single_flow.sh) has become quite popular, but it is important that developers also make sure to benchmark scalability of multiple receive queues. Signed-off-by: Jesper Dangaard Brouer --- samples/pktgen/pktgen_sample05_flow_per_thread.sh | 81 + 1 file changed, 81 insertions(+) create mode 100755 samples/pktgen/pktgen_sample05_flow_per_thread.sh diff --git a/samples/pktgen/pktgen_sample05_flow_per_thread.sh b/samples/pktgen/pktgen_sample05_flow_per_thread.sh new file mode 100755 index ..32ad818e2829 --- /dev/null +++ b/samples/pktgen/pktgen_sample05_flow_per_thread.sh @@ -0,0 +1,81 @@ +#!/bin/bash +# +# Script will generate one flow per thread (-t N) +# - Same destination IP +# - Fake source IPs for each flow (fixed based on thread number) +# +# Useful for scale testing on receiver, to see whether silo'ing flows +# works and scales. For optimal scalability (on receiver) each +# separate-flow should not access shared variables/data. This script +# helps magnify any of these scaling issues by overloading the receiver. 
+# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +# Set some default params, if they didn't get set +[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42" +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" +[ -z "$BURST" ] && BURST=32 + + +# Base Config +DELAY="0" # Zero means max speed +COUNT="0" # Zero means indefinitely + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = 0; thread < $THREADS; thread++)); do +dev=${DEV}@${thread} + +# Add remove all other devices and add_device $dev to thread +pg_thread $thread "rem_device_all" +pg_thread $thread "add_device" $dev + +# Base config +pg_set $dev "flag QUEUE_MAP_CPU" +pg_set $dev "count $COUNT" +pg_set $dev "clone_skb $CLONE_SKB" +pg_set $dev "pkt_size $PKT_SIZE" +pg_set $dev "delay $DELAY" +pg_set $dev "flag NO_TIMESTAMP" + +# Single destination +pg_set $dev "dst_mac $DST_MAC" +pg_set $dev "dst $DEST_IP" + +# Setup source IP-addresses based on thread number +pg_set $dev "src_min 198.18.$((thread+1)).1" +pg_set $dev "src_max 198.18.$((thread+1)).1" + +# Setup burst, for easy testing -b 0 disable bursting +# (internally in pktgen default and minimum burst=1) +if [[ ${BURST} -ne 0 ]]; then + pg_set $dev "burst $BURST" +else + info "$dev: Not using burst" +fi + +done + +# Run if user hits control-c +function print_result() { +# Print results +for ((thread = 0; thread < $THREADS; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done +} +# trap keyboard interrupt (Ctrl-C) +trap true SIGINT + +echo "Running... ctrl^C to stop" >&2 +pg_ctrl "start" + +print_result
[PATCH v3 3/3] mac80211: mesh: fixed HT ies in beacon template
The HT capab info field inside the HT capab IE of the mesh beacon is incorrect (in the case of 20MHz channel width). To fix this driver will check configuration from cfg and will build it accordingly. Signed-off-by: Meirav Kama Signed-off-by: Yaniv Machani --- V3 - Fixes redundant spaces,empty lines and added FALLTHROUGH note. net/mac80211/mesh.c | 33 - net/mac80211/util.c | 3 --- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 9214bc1..6a67049 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -422,6 +422,7 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; + u16 cap; sband = local->hw.wiphy->bands[band]; if (!sband->ht_cap.ht_supported || @@ -430,11 +431,41 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; + /* determine capability flags */ + cap = sband->ht_cap.cap; + + /* if channel width is 20MHz - configure HT capab accordingly*/ + if (sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20) { + cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; + cap &= ~IEEE80211_HT_CAP_DSSSCCK40; + } + + /* set SM PS mode properly */ + cap &= ~IEEE80211_HT_CAP_SM_PS; + switch (sdata->smps_mode) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + /* FALLTHROUGH */ + case IEEE80211_SMPS_OFF: + cap |= WLAN_HT_CAP_SM_PS_DISABLED << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_STATIC: + cap |= WLAN_HT_CAP_SM_PS_STATIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= WLAN_HT_CAP_SM_PS_DYNAMIC << + IEEE80211_HT_CAP_SM_PS_SHIFT; + break; + } + if (skb_tailroom(skb) < 2 + sizeof(struct ieee80211_ht_cap)) return -ENOMEM; pos = skb_put(skb, 2 + sizeof(struct ieee80211_ht_cap)); - ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, 
sband->ht_cap.cap); + ieee80211_ie_build_ht_cap(pos, &sband->ht_cap, cap); return 0; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 42bf0b6..5375a82 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -2349,10 +2349,7 @@ u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, ht_oper->operation_mode = cpu_to_le16(prot_mode); ht_oper->stbc_param = 0x; - /* It seems that Basic MCS set and Supported MCS set - are identical for the first 10 bytes */ memset(&ht_oper->basic_set, 0, 16); - memcpy(&ht_oper->basic_set, &ht_cap->mcs, 10); return pos + sizeof(struct ieee80211_ht_operation); } -- 2.9.0
[net-next PATCH 1/3] pktgen: add sample script pktgen_sample04_many_flows.sh
Adding a pktgen sample script that demonstrates how to use pktgen for simulating flows. Script will generate a certain number of concurrent flows ($FLOWS) and each flow will contain $FLOWLEN packets, which will be send back-to-back, before switching to a new flow, due to flag FLOW_SEQ. This script obsoletes the old sample script 'pktgen.conf-1-1-flows', which is removed. Signed-off-by: Jesper Dangaard Brouer --- samples/pktgen/pktgen.conf-1-1-flows | 67 --- samples/pktgen/pktgen_sample04_many_flows.sh | 93 ++ 2 files changed, 93 insertions(+), 67 deletions(-) delete mode 100755 samples/pktgen/pktgen.conf-1-1-flows create mode 100755 samples/pktgen/pktgen_sample04_many_flows.sh diff --git a/samples/pktgen/pktgen.conf-1-1-flows b/samples/pktgen/pktgen.conf-1-1-flows deleted file mode 100755 index 081749c9707d.. --- a/samples/pktgen/pktgen.conf-1-1-flows +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { -local result - -echo $1 > $PGDEV - -result=`cat $PGDEV | fgrep "Result: OK:"` -if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: -fi -} - -# Config Start Here --- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 -# We need to do alloc for every skb since we cannot clone here. 
- -CLONE_SKB="clone_skb 0" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 60" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 1000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - # Random address with in the min-max range - pgset "flag IPDST_RND" - pgset "dst_min 10.0.0.0" - pgset "dst_max 10.255.255.255" - - # 8k Concurrent flows at 4 pkts - pgset "flows 8192" - pgset "flowlen 4" - - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen_sample04_many_flows.sh b/samples/pktgen/pktgen_sample04_many_flows.sh new file mode 100755 index ..f60412e445bb --- /dev/null +++ b/samples/pktgen/pktgen_sample04_many_flows.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# +# Script example for many flows testing +# +# Number of simultaneous flows limited by variable $FLOWS +# and number of packets per flow controlled by variable $FLOWLEN +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" + +# Parameter parsing via include +source ${basedir}/parameters.sh +# Set some default params, if they didn't get set +[ -z "$DEST_IP" ] && DEST_IP="198.18.0.42" +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" + +# NOTICE: Script specific settings +# === +# Limiting the number of concurrent flows ($FLOWS) +# and also set how many packets each flow contains ($FLOWLEN) +# +[ -z "$FLOWS" ] && FLOWS="8000" +[ -z "$FLOWLEN" ] && FLOWLEN="10" + +# Base Config +DELAY="0" # Zero means max speed +COUNT="0" # Zero means indefinitely + +if [[ -n "$BURST" ]]; then +err 1 "Bursting not supported for this mode" +fi + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((thread = 0; thread 
< $THREADS; thread++)); do +dev=${DEV}@${thread} + +# Add remove all other devices and add_device $dev to thread +pg_thread $thread "rem_device_all" +pg_thread $thread "add_device" $dev + +# Base config +pg_set $dev "flag QUEUE_MAP_CPU" +pg_set $dev "count $COUNT" +pg_set $dev "clone_skb $CLONE_SKB" +pg_set $dev "pkt_size $PKT_SIZE" +pg_set $dev "delay $DELAY" +pg_set $dev "flag NO_TIMESTAMP" + +# Single destination +pg_set $dev "dst_mac $DST_MAC" +pg_set $dev "dst $DEST_IP" + +# Randomize source IP-addresses +pg_set $dev "flag IPSRC_RND" +pg_set $dev "src_min 198.18.0.0" +pg_set $dev "src_max 198.19.255.255" + +# Limit number of flows (max 65535) +pg_set $dev "flows $FLOWS" +# +# How many packets a flow will send, before flow "entry" is +# re-generated/setup. +pg_set $dev "flowlen $FLOWLEN" +# +# Flag FLOW_SEQ will cause $FLOWLEN packets from the same flow +# being send back-to-back, before next flow is selected +# incrementally. This helps lookup caches, and is more realistic. +# +pg_set $dev "flag FLOW_SEQ" + +done + +# Run if user hits control-c +function print_result() { +# Print results +for ((thread = 0; thread < $THREADS; thread++)); do + dev=${DEV}@${thread} + echo "Device: $dev" +
[net-next PATCH 3/3] pktgen: remove sample script pktgen.conf-1-1-rdos
Removing the pktgen sample script pktgen.conf-1-1-rdos, because it does not contain anything that is not covered by the other and newer style sample scripts. Signed-off-by: Jesper Dangaard Brouer --- samples/pktgen/pktgen.conf-1-1-rdos | 64 --- 1 file changed, 64 deletions(-) delete mode 100755 samples/pktgen/pktgen.conf-1-1-rdos diff --git a/samples/pktgen/pktgen.conf-1-1-rdos b/samples/pktgen/pktgen.conf-1-1-rdos deleted file mode 100755 index c7553be49b80.. --- a/samples/pktgen/pktgen.conf-1-1-rdos +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -#modprobe pktgen - - -function pgset() { -local result - -echo $1 > $PGDEV - -result=`cat $PGDEV | fgrep "Result: OK:"` -if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: -fi -} - -# Config Start Here --- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 - -# We need to do alloc for every skb since we cannot clone here. - -CLONE_SKB="clone_skb 0" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 60" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 1000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - # Random address with in the min-max range - pgset "flag IPDST_RND" - pgset "dst_min 10.0.0.0" - pgset "dst_max 10.255.255.255" - - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1
RE: [PATCH 1/4] mac80211: mesh: flush stations before beacons are stopped
On Wed, Jul 13, 2016 at 16:33:38, Bob Copeland wrote: > linux- wirel...@vger.kernel.org; netdev@vger.kernel.org; Hahn, Maital > Subject: Re: [PATCH 1/4] mac80211: mesh: flush stations before beacons > are stopped > > On Wed, Jul 13, 2016 at 10:11:25AM +, Machani, Yaniv wrote: > > > > Some drivers (e.g. wl18xx) expect that the last stage in the > > > > de-initialization process will be stopping the beacons, similar to ap. > > > > Update ieee80211_stop_mesh() flow accordingly. > > > > > > > How well have you tested that with other drivers? > > > > > > > Sorry for the delayed response (I've been out) and thanks for your > > comments, I have tested it with RT3572 as well, and didn't see any issue. > > I'll update the comment to reflect that. > > I'll give this a test on ath10k and wcn36xx as they are the ones most > likely to care about ordering. > Thank you, Yaniv > -- > Bob Copeland %% http://bobcopeland.com/
Re: linux-next: Tree for Jul 13 (net/core/devlink with Tracing)
On Wed, 13 Jul 2016 08:12:16 -0700 Randy Dunlap wrote: > On 07/12/16 23:47, Stephen Rothwell wrote: > > Hi all, > > > > Changes since 20160712: > > > > on x86_64: > (full randconfig file is attached) > > > CC net/core/devlink.o > In file included from ../include/trace/define_trace.h:95:0, > from ../include/trace/events/devlink.h:51, > from ../net/core/devlink.c:30: > ../include/trace/events/devlink.h: In function > 'trace_event_get_offsets_devlink_hwmsg': > ../include/trace/events/devlink.h:25:51: error: dereferencing pointer to > incomplete type >__string(owner_name, devlink->dev->driver->owner->name) [snip the rest] When I remove all references to the owner_name (and that crazy dereferencing above), it compiles fine. There must be something funky with that devlink->dev->driver->owner->name part. -- Steve
Re: [PATCH v7 09/11] net/mlx4_en: add xdp forwarding and data write support
On Wed, Jul 13, 2016 at 8:30 PM, Brenden Blanco wrote: > On Wed, Jul 13, 2016 at 06:25:28PM +0300, Saeed Mahameed wrote: >> On Tue, Jul 12, 2016 at 12:29 AM, Brenden Blanco >> wrote: [...] >> >> MAX_TX_RING is a software limitation made to limit netdev real_num_tx >> queues for CX3 internal cache utilization, >> in your case the netdev doesn't care about xdp_tx rings, the >> accounting you added in this patch adds a lot of >> complications and it would be better to have clear separation between >> the two types of tx_rings, in terms of the holding/managing data >> structure. >> >> I suggest here to leave priv->tx_ring untouched. i.e, don't store the >> xdp_tx rings at the end of it, i.e priv->tx_ring should only reflect >> the >> netdev real tx queues. >> >> In case of priv->porg is active, we can allocate and store xdp tx ring >> per rx ring, this tx ring will be allocated and activated >> once the rx ring is created and activated, and store this dedicated tx >> ring in the rx_ring it self. >> >> i.e : >> struct mlx4_en_rx_ring { >> [...] >> struct mlx4_en_tx_ring *xdp_tx; >> struct mlx4_en_cq *xdp_tx_cq; >> [...] >> } >> >> for this the following changes are required. >> >> @ mlx4_en_create_rx_ring >> [...] 
// Create the RX ring >> >> /* create CQ for xdp tx ring */ >> node = cpu_to_node(i % num_online_cpus()); >> >> mlx4_en_create_cq(priv, &rx_ring->xdp_tx_cq, prof->tx_ring_size, i, TX, node) >> >> /* create xdp tx ring */ >> mlx4_en_create_tx_ring(priv, &rx_ring->xdp_tx, prof->tx_ring_size, >> TXBB_SIZE, node, -1) >> >> @mlx4_en_start/stop_port >> /* Configure tx cq's and rings */ >> // You will need to configure xdp tx rings same as priv->rx_ring_num rings >> >> @mlx4_en_poll_tx_cq >> This Also will require a new NAPI handler for xdp rings to replace the >> following line @mlx4_en_poll_tx_cq >> - struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; >> with >> + struct mlx4_en_tx_ring *ring = priv->rx_ring[cq->ring].xdp_tx; >> >> Or just change cq->ring from ring index to the actual ring pointer. >> >> Bottom line, my suggestion also started to look complicated :).. but >> still it would look cleaner to separate between netdev rings and xdp >> rings. >> > I considered this at first too, but it seemed the worse option to me at > the time. There would be a lot of copy/paste as well as new code to > review. We can start from a small refactoring patch that moves code around and extracts the needed helper functions. But it is up to you and Tariq. it is really non trivial to follow the logic of rsv_tx_rings and tx_ring_num accounting. >> >> If in this napi cycle we had at least one packet that went through >> XDP_PASS (mlx4_en_xmit_frame) you must hit doorbell here, > You mean XDP_TX? yes >> otherwise if no packet will be forwarded later, this packet will never >> be really sent and it will wait in HW forever. >> >> The idea is correct to hit the doorbell only at the end of >> mlx4_en_process_rx_cq cycle to batch tx descriptors and send them in >> one batch, > Up to a budget of 8 >> but you must hit doorbell at the end of the cycle. you can't just >> assume more RX packets will come in the future to hit the doorbell for >> you. > I don't assume that. 
If you look at the code, either: > xmit_frame rings the doorbell, in which case doorbell_pending <- 0 > or > xmit_frame doesn't ring the doorbell, in which case doorbell_pending++ > So how is a packet left in the ring unannounced? Ooh, now i see, yeap the logic is good. >> >> This condition will be checked always even for non XDP rings and when >> XDP is not enabled. >> can't we just figure a way not to have this for non XDP rings ? >> other than having a separate napi handler i don't see a way to do this. >> on the other hand, new NAPI handler would introduce a lot of code >> duplication. > Yes I considered a separate napi handler, but again that would be more > code duplication than it's worth, IMO. Yeah, I agree. >> >> > + >> > + skb = tx_info->skb; >> > + >> > /* We do not touch skb here, so prefetch skb->users location >> > * to speedup consume_skb() >> > */ >> > @@ -476,6 +494,9 @@ static bool mlx4_en_process_tx_cq(struct net_device >> > *dev, >> > ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb; >> > ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped; >> > >> > + if (ring->recycle_ring) >> > + return done < budget; >> > + >> > netdev_tx_completed_queue(ring->tx_queue, packets, bytes); >> > >> > /* Wakeup Tx queue if this stopped, and ring is not full. >> > @@ -1055,3 +1076,106 @@ tx_drop: >> > return NETDEV_TX_OK; >> > } >> > >> > +netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_alloc *frame, >> > + struct net_device *dev, unsigned int length, >> > + int tx_ind, int *doorbell_pending) >> > +{ >> > + struct mlx4_en_priv *priv = netdev_priv(dev); >> > + union mlx4_wqe_qpn_vlan
Re: [PATCH v2 net-next] net: vrf: Documentation update
From: David Ahern Date: Wed, 13 Jul 2016 06:19:37 -0600 > Update vrf documentation for changes made to 4.4 - 4.8 kernels > and iproute2 support for vrf keyword. > > Signed-off-by: David Ahern > --- > v2 > - comments from Frank Kellerman: extra whitespace in front of a neigh show > command. Convert the brief link example to 'vrf red'. Oops, I applied v1 already, please send me a relative patch with the changes. Thanks.
Re: [PATCH net 0/2] limit sk_filter trim to payload
From: Willem de Bruijn Date: Tue, 12 Jul 2016 18:18:55 -0400 > From: Willem de Bruijn > > Sockets can apply a filter to incoming packets to drop or trim them. > Fix two codepaths that call skb_pull/__skb_pull after sk_filter > without checking for packet length. > > Reading beyond skb->tail after trimming happens in more codepaths, but > safety of reading in the linear segment is based on minimum allocation > size (MAX_HEADER, GRO_MAX_HEAD, ..). Series applied and queued up for -stable, thanks.
Re: [PATCH v2 net] tcp: make challenge acks less predictable
I see your point and I agree with you that SSL protects victims from this hijacking attack, especially with full HSTS. For Windows case, since Windows is a black box for us, we tested its Challenge ACK mechanism with Windows Server 2012 R2 Base and Windows Server 2008 R2 from Amazon EC2. The results show that Windows also add some strategies to mitigate blind in-window attack problem, but the mitigated results are not as same as what mentioned in RFC 5961. Please let me know if I said something wrong. Thanks for the fix! Best, Yue On Mon, Jul 11, 2016 at 1:02 AM, Eric Dumazet wrote: > On Sun, 2016-07-10 at 11:28 -0700, Yue Cao wrote: >> This second patch does make our attack much harder but it's still >> possible to do such off-path attack with enough network bandwidth. >> Here is our modified attack for this second patch. >> >> Modified Attack: >> Main idea of our attack is to send multiple same spoofed packets in 1 >> second so attacker can confirm if it's a right guess or wrong guess. >> In more detail, attacker sends more than 1000 (e.g. 1500) spoofed >> packets for a same guessed value at beginning. After that, attacker >> sends 1500 packets during the same second to determine whether >> previous guess is right or wrong, by using following rules: >> If attacker receives less than 500 Challenge ACKs, it's a right guess. >> For a example, if 1500 spoofed packets are sent with a correct >> value(right guess), all Challenge ACKs will be sent to victim client >> in that second and attacker receives nothing. Otherwise, it's a wrong >> guess. >> >> Since this global rate limit always leaks some information as a >> side-channel, we are wondering if eliminating it completely would be a >> good idea. In fact, according to our latest test, FreeBSD and Windows >> do not have any such rate limit implemented. Looking forward to your >> replies. > > Are you sure Windows is implementing RFC 5961 ? Linux got in in 3.6. 
> > We do want RFC 5961, compared to the small nuisance of the attack you > describe. > > Nuisance of having a way for hackers to send a RST packet after > consuming thousands of probe packets is nothing, compared to the > nuisance of ACK storms we had before rate limiting was added in 3.6 (and > refined in 4.0). This was a serious problem for real servers, because of > buggy firewalls and appliances. > > You probably know that if someone worries about TCP flows being > compromised, it should use SSL, so that traffic injection is less likely > to happen. > > Most TCP flows in the Internet are short lived (less than 1 minute). > > Having to establish about 500 flows to the victim is already a > challenge, since the victim would already be in trouble if it was > allowing so many idle flows. > > So the 'solution' would be to backport > f2b2c582e82429270d5818fbabe653f4359d7024 > ("tcp: mitigate ACK loops for connections as tcp_sock") > > Then apply the v2 patch so that the limit is randomized. > > Then set the default limit to 2^31 > > >
Re: [PATCH net-next 00/10] Mellanox 100G mlx5 Bulk flow statistics and SRIOV TC offloads
From: David Miller Date: Wed, 13 Jul 2016 11:46:25 -0700 (PDT) > Series applied, thanks. Actually, I have to revert. Please fix this build warning and resubmit, thanks. drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c: In function ‘mlx5_fc_stats_work’: drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c:191:48: warning: ‘last’ may be used uninitialized in this function [-Wmaybe-uninitialized] node = mlx5_fc_stats_query(dev, counter, last->id); ^
[iproute PATCH v4 4/5] No need to initialize rtattr fields before parsing
Since parse_rtattr_flags() calls memset already, there is no need for callers to do so themselves. Signed-off-by: Phil Sutter --- ip/ipaddress.c | 2 +- tc/tc_class.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ip/ipaddress.c b/ip/ipaddress.c index cfcebe76af399..60862c5700330 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -449,7 +449,7 @@ static void print_num(FILE *fp, unsigned int width, uint64_t count) static void print_vf_stats64(FILE *fp, struct rtattr *vfstats) { - struct rtattr *vf[IFLA_VF_STATS_MAX + 1] = {}; + struct rtattr *vf[IFLA_VF_STATS_MAX + 1]; if (vfstats->rta_type != IFLA_VF_STATS) { fprintf(stderr, "BUG: rta type is %d\n", vfstats->rta_type); diff --git a/tc/tc_class.c b/tc/tc_class.c index 1690ec1bbfad8..f3864d22f3c4d 100644 --- a/tc/tc_class.c +++ b/tc/tc_class.c @@ -219,7 +219,7 @@ static void graph_cls_show(FILE *fp, char *buf, struct hlist_head *root_list, { struct hlist_node *n, *tmp_cls; char cls_id_str[256] = {}; - struct rtattr *tb[TCA_MAX + 1] = {}; + struct rtattr *tb[TCA_MAX + 1]; struct qdisc_util *q; char str[100] = {}; @@ -304,7 +304,7 @@ int print_class(const struct sockaddr_nl *who, FILE *fp = (FILE *)arg; struct tcmsg *t = NLMSG_DATA(n); int len = n->nlmsg_len; - struct rtattr *tb[TCA_MAX + 1] = {}; + struct rtattr *tb[TCA_MAX + 1]; struct qdisc_util *q; char abuf[256]; -- 2.8.2
[iproute PATCH v4 2/5] Use C99 style initializers everywhere
This big patch was compiled by vimgrepping for memset calls and changing to C99 initializer if applicable. One notable exception is the initialization of union bpf_attr in tc/tc_bpf.c: changing it would break for older gcc versions (at least <=3.4.6). Calls to memset for struct rtattr pointer fields for parse_rtattr*() were just dropped since they are not needed. The changes here allowed the compiler to discover some unused variables, so get rid of them, too. Signed-off-by: Phil Sutter --- Changes since v3: - Use empty instead of zero initializer. Changes since v2: - Flatten initializers. - Leave a final comma in place. - Fix checkpatch warnings. - Initialize nlmsg_seq in the declaration, too. - Use C99-style init in tc_bpf.c to get rid of the memset(). Changes since v1: - Dropped former changes to tc/tc_bpf.c as they are incompatible to older gcc versions (at least <=3.4.6). --- bridge/fdb.c | 25 ++--- bridge/link.c| 14 +++ bridge/mdb.c | 17 - bridge/vlan.c| 17 - genl/ctrl.c | 44 +- ip/ip6tunnel.c | 10 ++--- ip/ipaddress.c | 31 +++- ip/ipaddrlabel.c | 21 --- ip/iplink.c | 61 +- ip/iplink_can.c | 4 +- ip/ipmaddr.c | 25 - ip/ipmroute.c| 8 +--- ip/ipneigh.c | 30 ++- ip/ipnetconf.c | 10 ++--- ip/ipnetns.c | 39 +--- ip/ipntable.c| 25 - ip/iproute.c | 78 ++- ip/iprule.c | 22 +-- ip/iptoken.c | 19 -- ip/iptunnel.c| 31 +--- ip/ipxfrm.c | 26 - ip/link_gre.c| 18 - ip/link_gre6.c | 18 - ip/link_ip6tnl.c | 25 + ip/link_iptnl.c | 22 +-- ip/link_vti.c| 18 - ip/link_vti6.c | 18 - ip/xfrm_policy.c | 99 - ip/xfrm_state.c | 110 +++ lib/libnetlink.c | 77 ++ lib/ll_map.c | 1 - misc/arpd.c | 64 ++-- misc/ss.c| 37 +++ tc/e_bpf.c | 7 +--- tc/em_cmp.c | 4 +- tc/em_ipset.c| 4 +- tc/em_meta.c | 4 +- tc/em_nbyte.c| 4 +- tc/em_u32.c | 4 +- tc/f_flow.c | 3 -- tc/f_flower.c| 3 +- tc/f_fw.c| 6 +-- tc/f_route.c | 3 -- tc/f_rsvp.c | 6 +-- tc/f_u32.c | 12 ++ tc/m_bpf.c | 5 +-- tc/m_csum.c | 4 +- tc/m_ematch.c| 4 +- tc/m_gact.c | 5 +-- tc/m_ife.c | 5 +-- tc/m_mirred.c| 7 +--- tc/m_nat.c | 4 +- 
tc/m_pedit.c | 8 +--- tc/m_police.c| 5 +-- tc/q_atm.c | 3 +- tc/q_cbq.c | 22 +++ tc/q_choke.c | 4 +- tc/q_codel.c | 3 +- tc/q_dsmark.c| 1 - tc/q_fifo.c | 4 +- tc/q_fq_codel.c | 3 +- tc/q_hfsc.c | 13 ++- tc/q_htb.c | 15 +++- tc/q_netem.c | 16 +++- tc/q_red.c | 4 +- tc/q_sfb.c | 17 - tc/q_sfq.c | 4 +- tc/q_tbf.c | 4 +- tc/tc_bpf.c | 54 ++- tc/tc_class.c| 31 ++-- tc/tc_exec.c | 3 +- tc/tc_filter.c | 33 ++--- tc/tc_qdisc.c| 33 ++--- tc/tc_stab.c | 4 +- tc/tc_util.c | 3 +- 75 files changed, 532 insertions(+), 913 deletions(-) diff --git a/bridge/fdb.c b/bridge/fdb.c index be849f980a802..59538b1e16506 100644 --- a/bridge/fdb.c +++ b/bridge/fdb.c @@ -177,16 +177,15 @@ static int fdb_show(int argc, char **argv) struct nlmsghdr n; struct ifinfomsgifm; charbuf[256]; - } req; + } req = { + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), + .ifm.ifi_family = PF_BRIDGE, + }; char *filter_dev = NULL; char *br = NULL; int msg_size = sizeof(struct ifinfomsg); - memset(&req, 0, sizeof(req)); - req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.ifm.ifi_family = PF_BRIDGE; - while (argc > 0) { if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 0) { NEXT_ARG(); @@ -247,7 +246,13 @@ static int fdb_modify(int cmd, int flags, int argc, char **argv) struct nlmsghdr n; struct ndmsgndm; charbuf[256]; - } req; + } req = { + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)), + .n.nlmsg_flags = NLM_F_REQUEST | flags, + .n.nlmsg_type = cmd, + .ndm.ndm_family = PF_BRIDGE, + .ndm.ndm_state = NUD_NOARP, + }; char *addr = NULL; char *
[iproute PATCH v4 1/5] tc: m_action: Improve conversion to C99 style initializers
This improves my initial change in the following points: - Flatten embedded struct's initializers. - No need to initialize variables to zero as the key feature of C99 initializers is to do this implicitly. - By relocating the declaration of struct rtattr *tail, it can be initialized at the same time. Fixes: a0a73b298a579 ("tc: m_action: Use C99 style initializers for struct req") Signed-off-by: Phil Sutter --- Changes since v2: - Don't drop the "superfluous" comma. - Flatten initializers. Changes since v1: - Created this patch. --- tc/m_action.c | 23 +++ 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/tc/m_action.c b/tc/m_action.c index ea16817aefd4f..806fdd197965d 100644 --- a/tc/m_action.c +++ b/tc/m_action.c @@ -395,13 +395,10 @@ static int tc_action_gd(int cmd, unsigned int flags, int *argc_p, char ***argv_p struct tcamsg t; charbuf[MAX_MSG]; } req = { - .n = { - .nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), - .nlmsg_flags = NLM_F_REQUEST | flags, - .nlmsg_type = cmd, - }, + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), + .n.nlmsg_flags = NLM_F_REQUEST | flags, + .n.nlmsg_type = cmd, .t.tca_family = AF_UNSPEC, - .buf = { 0 } }; argc -= 1; @@ -491,23 +488,18 @@ static int tc_action_modify(int cmd, unsigned int flags, int *argc_p, char ***ar int argc = *argc_p; char **argv = *argv_p; int ret = 0; - - struct rtattr *tail; struct { struct nlmsghdr n; struct tcamsg t; charbuf[MAX_MSG]; } req = { - .n = { - .nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), - .nlmsg_flags = NLM_F_REQUEST | flags, - .nlmsg_type = cmd, - }, + .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)), + .n.nlmsg_flags = NLM_F_REQUEST | flags, + .n.nlmsg_type = cmd, .t.tca_family = AF_UNSPEC, - .buf = { 0 } }; + struct rtattr *tail = NLMSG_TAIL(&req.n); - tail = NLMSG_TAIL(&req.n); argc -= 1; argv += 1; if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) { @@ -540,7 +532,6 @@ static int tc_act_list_or_flush(int argc, char **argv, int event) } req = { .n.nlmsg_len = 
NLMSG_LENGTH(sizeof(struct tcamsg)), .t.tca_family = AF_UNSPEC, - .buf = { 0 } }; tail = NLMSG_TAIL(&req.n); -- 2.8.2
[iproute PATCH v4 5/5] Makefile: Allow to override CC
This makes it easier to build iproute2 with a custom compiler. While at it, make HOSTCC default to the value of CC if not explicitly set elsewhere. Signed-off-by: Phil Sutter --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 15c81ecfdca3a..fa200ddb76679 100644 --- a/Makefile +++ b/Makefile @@ -34,8 +34,8 @@ ADDLIB+=ipx_ntop.o ipx_pton.o #options for mpls ADDLIB+=mpls_ntop.o mpls_pton.o -CC = gcc -HOSTCC = gcc +CC := gcc +HOSTCC ?= $(CC) DEFINES += -D_GNU_SOURCE # Turn on transparent support for LFS DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -- 2.8.2
[iproute PATCH v4 3/5] Replace malloc && memset by calloc
This only replaces occurrences where the newly allocated memory is cleared completely afterwards, as in other cases it is a theoretical performance hit although code would be cleaner this way. Signed-off-by: Phil Sutter --- Changes since v2: - Fix checkpatch errors. --- genl/genl.c| 3 +-- lib/names.c| 7 ++- misc/lnstat.c | 6 ++ misc/lnstat_util.c | 4 +--- tc/em_canid.c | 4 ++-- tc/m_action.c | 3 +-- tc/m_ipt.c | 13 - tc/m_pedit.c | 3 +-- tc/tc.c| 9 +++-- tc/tc_bpf.c| 4 +--- tc/tc_class.c | 3 +-- tc/tc_exec.c | 3 +-- 12 files changed, 20 insertions(+), 42 deletions(-) diff --git a/genl/genl.c b/genl/genl.c index e33fafdf2f524..747074b029a7b 100644 --- a/genl/genl.c +++ b/genl/genl.c @@ -86,9 +86,8 @@ reg: return f; noexist: - f = malloc(sizeof(*f)); + f = calloc(1, sizeof(*f)); if (f) { - memset(f, 0, sizeof(*f)); strncpy(f->name, str, 15); f->parse_genlopt = parse_nofopt; f->print_genlopt = print_nofopt; diff --git a/lib/names.c b/lib/names.c index 3b5b0b1e1201a..fbd6503f22d42 100644 --- a/lib/names.c +++ b/lib/names.c @@ -54,15 +54,12 @@ struct db_names *db_names_alloc(void) { struct db_names *db; - db = malloc(sizeof(*db)); + db = calloc(1, sizeof(*db)); if (!db) return NULL; - memset(db, 0, sizeof(*db)); - db->size = MAX_ENTRIES; - db->hash = malloc(sizeof(struct db_entry *) * db->size); - memset(db->hash, 0, sizeof(struct db_entry *) * db->size); + db->hash = calloc(db->size, sizeof(struct db_entry *)); return db; } diff --git a/misc/lnstat.c b/misc/lnstat.c index 659a01bd69931..863fd4d9f03f2 100644 --- a/misc/lnstat.c +++ b/misc/lnstat.c @@ -182,10 +182,8 @@ static struct table_hdr *build_hdr_string(struct lnstat_file *lnstat_files, static struct table_hdr th; int ofs = 0; - for (i = 0; i < HDR_LINES; i++) { - th.hdr[i] = malloc(HDR_LINE_LENGTH); - memset(th.hdr[i], 0, HDR_LINE_LENGTH); - } + for (i = 0; i < HDR_LINES; i++) + th.hdr[i] = calloc(1, HDR_LINE_LENGTH); for (i = 0; i < fps->num; i++) { char *cname, *fname = fps->params[i].lf->name; diff --git 
a/misc/lnstat_util.c b/misc/lnstat_util.c index d918151282f55..cc54598fe1bef 100644 --- a/misc/lnstat_util.c +++ b/misc/lnstat_util.c @@ -173,15 +173,13 @@ static struct lnstat_file *alloc_and_open(const char *path, const char *file) struct lnstat_file *lf; /* allocate */ - lf = malloc(sizeof(*lf)); + lf = calloc(1, sizeof(*lf)); if (!lf) { fprintf(stderr, "out of memory\n"); return NULL; } /* initialize */ - memset(lf, 0, sizeof(*lf)); - /* de->d_name is guaranteed to be <= NAME_MAX */ strcpy(lf->basename, file); strcpy(lf->path, path); diff --git a/tc/em_canid.c b/tc/em_canid.c index 16f6ed5c0b7a4..ceb64cb933f51 100644 --- a/tc/em_canid.c +++ b/tc/em_canid.c @@ -106,8 +106,8 @@ static int canid_parse_eopt(struct nlmsghdr *n, struct tcf_ematch_hdr *hdr, if (args == NULL) return PARSE_ERR(args, "canid: missing arguments"); - rules.rules_raw = malloc(sizeof(struct can_filter) * rules.rules_capacity); - memset(rules.rules_raw, 0, sizeof(struct can_filter) * rules.rules_capacity); + rules.rules_raw = calloc(rules.rules_capacity, +sizeof(struct can_filter)); do { if (!bstrcmp(args, "sff")) { diff --git a/tc/m_action.c b/tc/m_action.c index 806fdd197965d..24f8b5d855211 100644 --- a/tc/m_action.c +++ b/tc/m_action.c @@ -126,9 +126,8 @@ noexist: goto restart_s; } #endif - a = malloc(sizeof(*a)); + a = calloc(1, sizeof(*a)); if (a) { - memset(a, 0, sizeof(*a)); strncpy(a->id, "noact", 15); a->parse_aopt = parse_noaopt; a->print_aopt = print_noaopt; diff --git a/tc/m_ipt.c b/tc/m_ipt.c index 098f610f9439a..d6f62bd6b32c9 100644 --- a/tc/m_ipt.c +++ b/tc/m_ipt.c @@ -164,16 +164,11 @@ get_target_name(const char *name) return NULL; #endif - new_name = malloc(strlen(name) + 1); - lname = malloc(strlen(name) + 1); - if (new_name) - memset(new_name, '\0', strlen(name) + 1); - else + new_name = calloc(1, strlen(name) + 1); + lname = calloc(1, strlen(name) + 1); + if (!new_name) exit_error(PARAMETER_PROBLEM, "get_target_name"); - - if (lname) - memset(lname, '\0', strlen(name) + 1); 
- else + if (!lname) exit_error(PARAMETER_PROBLEM, "get_target_name"); strcpy(new_name, name); diff --git a/tc/m_ped
Re: [PATCH net-next 00/10] Mellanox 100G mlx5 Bulk flow statistics and SRIOV TC offloads
From: Saeed Mahameed Date: Wed, 13 Jul 2016 00:28:56 +0300 > This series from Amir and Or deals with two enhancements for the mlx5 TC > offloads. > > The 1st two patches add bulk reading of flow counters. Few bulk counter > queries are > used instead of issuing thousands firmware commands per second to get > statistics of all > flows set to HW. > > The next patches add TC based SRIOV offloading to mlx5, as a follow up for > the e-switch > offloads mode and the VF representors. When the e-switch is set to the (new) > "offloads" > mode, we can now offload TC/flower drop and forward rules, the forward action > we offload > is TC mirred/redirect. > > The above is done by the VF representor netdevices exporting the setup_tc ndo > where from > there we're re-using and enhancing the existing mlx5 TC offloads sub-module > which now > works for both the NIC and the SRIOV cases. > > The series is applied on top d3fc0353f7c7 ('ipv4: af_inet: make it explicitly > non-modular') > and it has no merge issues with the on-going net submission ('mlx5 tx timeout > watchdog fixes') Series applied, thanks.
[iproute PATCH v4 0/5] Big C99 style initializer rework
This is v4 of my C99-style initializer related patch series. The changes since v3 are: - Use empty initializer instead of the universal zero initializer: The latter one triggers warnings in older GCCs, and this appears to be the least intrusive workaround. Plus, empty initializers are used all over the code already, so it won't make things worse. (GCC in pedantic mode does not like them, but that is a can of worms by itself.) - Dropped patch 6 (unsigned value comparison simplification): It unintentionally changes that comparison's semantics, and I am not completely sure the change is correct - therefore rather leave it as is. - Rebased onto current origin/master again (no conflicts). For reference, here's the v3 changelog: - Flattened embedded struct's initializers: Since the field names are very short, I figured it makes more sense to keep indenting low. Also, the same style is already used in ip/xfrm_policy.c so take that as an example. - Moved leftover nlmsg_seq initializing into the common place as well: I was unsure whether this is a good idea at first (due to the increment), but again it's done in ip/xfrm_policy.c as well so should be fine. - Added a comma after the last field initializer as suggested by Jakub. - Dropped patch 7 since it was NACKed. - Eliminated checkpatch non-compliance. - Second go at union bpf_attr in tc/tc_bpf.c: I figured that while it is not possible to initialize fields, gcc-3.4.6 does not complain when setting the whole union to zero using '= {0}'. So I did this and thereby at least got rid of the memset calls. For reference, here's the v2 changelog: - Rebased onto current upstream master: My own commit a0a73b298a579 ("tc: m_action: Use C99 style initializers for struct req") contains most of the changes to tc/m_action.c already, so I put the remaining ones into a dedicated patch (the first one here) with a better description. - Tested against gcc-3.4.6: This is the oldest gcc version I was able to install locally. 
It indeed does not like the former changes in tc/tc_bpf.c, so I reverted them. Apart from emitting many warnings, it successfully compiles the sources. In the process of compatibility testing, I made a few more changes which make sense to have: - New patch 5 allows to conveniently override the compiler via command line. - New patch 6 eliminates a warning with old gcc but looks valid in general. - A warning made me look at ip/tcp_metrics.c and I found a minor code simplification (patch 7). Phil Sutter (5): tc: m_action: Improve conversion to C99 style initializers Use C99 style initializers everywhere Replace malloc && memset by calloc No need to initialize rtattr fields before parsing Makefile: Allow to override CC Makefile | 4 +- bridge/fdb.c | 25 ++-- bridge/link.c | 14 +++ bridge/mdb.c | 17 - bridge/vlan.c | 17 - genl/ctrl.c| 44 + genl/genl.c| 3 +- ip/ip6tunnel.c | 10 ++--- ip/ipaddress.c | 33 +++- ip/ipaddrlabel.c | 21 -- ip/iplink.c| 61 - ip/iplink_can.c| 4 +- ip/ipmaddr.c | 25 ip/ipmroute.c | 8 +--- ip/ipneigh.c | 30 ++- ip/ipnetconf.c | 10 ++--- ip/ipnetns.c | 39 +-- ip/ipntable.c | 25 ip/iproute.c | 78 + ip/iprule.c| 22 +-- ip/iptoken.c | 19 - ip/iptunnel.c | 31 +-- ip/ipxfrm.c| 26 - ip/link_gre.c | 18 - ip/link_gre6.c | 18 - ip/link_ip6tnl.c | 25 +--- ip/link_iptnl.c| 22 +-- ip/link_vti.c | 18 - ip/link_vti6.c | 18 - ip/xfrm_policy.c | 99 +++ ip/xfrm_state.c| 110 ++--- lib/libnetlink.c | 77 ++--- lib/ll_map.c | 1 - lib/names.c| 7 +--- misc/arpd.c| 64 ++- misc/lnstat.c | 6 +-- misc/lnstat_util.c | 4 +- misc/ss.c | 37 +++--- tc/e_bpf.c | 7 +--- tc/em_canid.c | 4 +- tc/em_cmp.c| 4 +- tc/em_ipset.c | 4 +- tc/em_meta.c | 4 +- tc/em_nbyte.c | 4 +- tc/em_u32.c| 4 +- tc/f_flow.c| 3 -- tc/f_flower.c | 3 +- tc/f_fw.c | 6 +-- tc/f_route.c | 3 -- tc/f_rsvp.c| 6 +-- tc/f_u32.c | 12 ++ tc/m_action.c | 26 - tc/m_bpf.c | 5 +-- tc/m_csum.c| 4 +- tc/m_ematch.c | 4 +- tc/m_gact.c| 5 +-- tc/m_ife.c | 5 +-- tc/m_ipt.c | 13 ++- tc/m_mirred.c | 7 +--- tc/m_nat.c
Re: [PATCH net 0/2] mlx5 tx timeout watchdog fixes
From: Saeed Mahameed Date: Wed, 13 Jul 2016 00:06:58 +0300 > This patch set provides two trivial fixes for the tx timeout series lately > applied into net 4.7. > > From Daniel, detect stuck queues due to BQL > From Mohamad, fix tx timeout watchdog false alarm > > Hopefully those two fixes will make it to -stable, assuming > 3947ca185999 ('net/mlx5e: Implement ndo_tx_timeout callback') was also > backported to -stable. Series applied.
Re: [PATCH net-next] net: vrf: Documentation update
From: David Ahern Date: Tue, 12 Jul 2016 15:04:23 -0600 > Update vrf documentation for changes made to 4.4 - 4.8 kernels > and iproute2 support for vrf keyword. > > Signed-off-by: David Ahern Applied, thanks David.
pull-request: wireless-drivers-next 2016-07-13
Hi Dave, here's a pull request for net-next. This time there are a few conflicts due to the cfg80211 scan API changes, and one of them is easy to miss, so please pay extra attention to them. Otherwise there's nothing really out of the ordinary. Please note that I also pulled wireless-drivers to wireless-drivers-next to reduce the amount of conflicts. So about the conflicts, the obvious are notified by git: CONFLICT (content): Merge conflict in drivers/net/wireless/marvell/mwifiex/cmdevt.c CONFLICT (content): Merge conflict in drivers/net/wireless/intel/iwlwifi/mvm/scan.c Basically the major change is that in iwlwifi del_timer() is changed to cancel_delayed_work() and in mwifiex the code was refactored to use mwifiex_cancel_scan(). But the tricky part comes here which is easy to miss: Auto-merging drivers/net/wireless/marvell/mwifiex/scan.c You need to convert the scan code in mwifiex_cancel_scan(): cfg80211_scan_done(priv->scan_request, 1); to use the new API: struct cfg80211_scan_info info = { .aborted = true, }; [...] cfg80211_scan_done(priv->scan_request, &info); I have attached the output from git diff as an example of how to resolve this, hopefully that helps. Please let me know if there are any problems or if you want to handle these differently. 
Kalle The following changes since commit 742fb20fd4c75bd08733b0ea232c7e0fa67a6f87: net: ethernet: ti: cpdma: switch to use genalloc (2016-06-29 04:16:11 -0400) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers-next.git tags/wireless-drivers-next-for-davem-2016-07-13 for you to fetch changes up to 25f700ef0653d7644ed273f8770230e734cae726: iwlwifi: add missing type declaration (2016-07-12 14:51:57 +0300) wireless-drivers-next patches for 4.8 Major changes: iwlwifi * more work on the RX path for the 9000 device series * some more dynamic queue allocation work * SAR BIOS implementation * some work on debugging capabilities * added support for GCMP encryption * data path rework in preparation for new HW * some cleanup to remove transport dependency on mac80211 * support for MSIx in preparation for new HW * lots of work in preparation for HW support (9000 and a000 series) mwifiex * implement get_tx_power and get_antenna cfg80211 operation callbacks wl18xx * add support for 64bit clock rtl8xxxu * aggregation support (optional for now) Also wireless-drivers is merged to fix some conflicts. 
Amitkumar Karwar (8): mwifiex: fix system hang problem after resume mwifiex: fix AP unable to start in VHT40 problem mwifiex: fix AP start problem for newly added interface mwifiex: code rearrangement in suspend handler mwifiex: clear scan_aborting flag mwifiex: fix NULL pointer dereference during suspend mwifiex: fix scan_block flag handling mwifiex: Change default firmware for PCIe8997 chipset Andrei Otcheretianski (1): iwlwifi: mvm: Support CSA countdown offloading Andy Shevchenko (1): rtl8xxxu: tuse %*ph to dump buffers Arnd Bergmann (6): rtlwifi: use s8 instead of char wireless: airo: rename 'register' variable wireless: brcmsmac: fix old-style declaration wireless: ipw2200: fix old-style declaration iwlwifi: mvm: avoid harmless -Wmaybe-uninialized warning iwlwifi: add missing type declaration Avraham Stern (1): iwlwifi: rename CAPA_P2P_STANDALONE_UAPSD to CAPA_P2P_SCM_UAPSD Ayala Beker (2): iwlwifi: mvm: fix RX mpdu status enum iwlwifi: mvm: add support for GCMP encryption Bhaktipriya Shridhar (1): libertas_tf: Remove create_workqueue Brian Norris (1): mwifiex: mask PCIe interrupts before removal Bruno Herrera (1): wlcore: sdio: Fix crash on wlcore_probe_of when failing to parse/map irq Dan Carpenter (2): iwlwifi: mvm: remove an unused variable iwlwifi: mvm: silence uninitialized variable warning Emmanuel Grumbach (7): iwlwifi: advertise maximal MPDU length when Rx MQ is supported iwlwifi: pcie: enable interrupts before releasing the NIC's CPU iwlwifi: mvm: cleanup the coex code iwlwifi: mvm: fix coex related comments iwlwifi: mvm: fix the channel inhibition table for Channel 14 iwlwifi: mvm: unmap the paging memory before freeing it iwlwifi: pcie: fix a race in firmware loading flow Ganapathi Bhat (1): mwifiex: Fix an issue spotted by KASAN Golan Ben-Ami (2): iwlwifi: Reserve iwl_fw_error_dump_type enum iwlwifi: mvm: write the correct internal TXF index Gregory Greenman (1): iwlwifi: mvm: rs: add rate scaling support for 160MHz channels Guenter Roeck (1): 
iwlwifi: dvm: Remove unused array 'iwlagn_loose_lookup' Guy Mishol (1): wlcore: reconfigure sta rates on authori
[PATCH net-next 6/6] sctp: only check for ECN if peer is using it
Currently only read-only checks are performed up to the point on where we check if peer is ECN capable, checks which we can avoid otherwise. The flag ecn_ce_done is only used to perform this check once per incoming packet, and nothing more. Thus this patch moves the peer check up. Signed-off-by: Marcelo Ricardo Leitner --- net/sctp/sm_statefuns.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index b7c1f7f3c8388400e51e3fbdbe099bc354559913..d88bb2b0b69913ad5962f9a5655d413f2c210ed0 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6118,12 +6118,11 @@ static int sctp_eat_data(const struct sctp_association *asoc, * chunk later. */ - if (!chunk->ecn_ce_done) { + if (asoc->peer.ecn_capable && !chunk->ecn_ce_done) { struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af; chunk->ecn_ce_done = 1; - if (af->is_ce(sctp_gso_headskb(chunk->skb)) && - asoc->peer.ecn_capable) { + if (af->is_ce(sctp_gso_headskb(chunk->skb))) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, SCTP_U32(tsn)); -- 2.7.4
[PATCH net-next 0/6] sctp: allow GSO frags to access the chunk too
Patchset is named after the most important fix in it. First two patches are preparing the grounds for the 3rd patch. After the 3rd, they are not strictly logically related to the patchset, but I kept them together as they depend on each other. More details on patch changelogs. Thanks! Marcelo Ricardo Leitner (6): sctp: allow others to use sctp_input_cb sctp: reorder sctp_ulpevent and shrink msg_flags sctp: allow GSO frags to access the chunk too sctp: avoid identifying address family many times for a chunk sctp: do not clear chunk->ecn_ce_done flag sctp: only check for ECN if peer is using it include/net/sctp/structs.h | 23 +++ include/net/sctp/ulpevent.h | 12 ++-- net/sctp/input.c| 12 +--- net/sctp/inqueue.c | 9 - net/sctp/ipv6.c | 9 - net/sctp/protocol.c | 1 + net/sctp/sm_make_chunk.c| 20 net/sctp/sm_statefuns.c | 9 +++-- net/sctp/socket.c | 10 +++--- net/sctp/ulpevent.c | 14 +++--- 10 files changed, 68 insertions(+), 51 deletions(-) -- 2.7.4
[PATCH net-next 1/6] sctp: allow others to use sctp_input_cb
We process input path in other files too and having access to it is nice, so move it to a header where it's shared. Signed-off-by: Marcelo Ricardo Leitner --- include/net/sctp/structs.h | 15 +++ net/sctp/input.c | 11 --- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8626bdd3249a9283955fe81bc3255be0a18717f9..966c3a40039c12a7c525612594a51312d5de1d2a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -59,6 +59,7 @@ #include/* We need tq_struct.*/ #include /* We need sctp* header structs. */ #include /* We need auth specific structs */ +#include /* For inet_skb_parm */ /* A convenience structure for handling sockaddr structures. * We should wean ourselves off this. @@ -1092,6 +1093,20 @@ static inline void sctp_outq_cork(struct sctp_outq *q) q->cork = 1; } +/* SCTP skb control block. + * sctp_input_cb is currently used on rx and sock rx queue + */ +struct sctp_input_cb { + union { + struct inet_skb_parmh4; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_skb_parm h6; +#endif + } header; + struct sctp_chunk *chunk; +}; +#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) + /* These bind address data fields common between endpoints and associations */ struct sctp_bind_addr { diff --git a/net/sctp/input.c b/net/sctp/input.c index 6f8e676d285ead987b0a1337beec3b29c34e0a8e..7a327ff71f08985f6ebb963d5cdc9540b23d0666 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -90,17 +90,6 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) return 0; } -struct sctp_input_cb { - union { - struct inet_skb_parmh4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - struct sctp_chunk *chunk; -}; -#define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) - /* * This is the routine which IP calls when receiving an SCTP packet. */ -- 2.7.4
[PATCH net-next 4/6] sctp: avoid identifying address family many times for a chunk
Identifying address family operations during rx path is not something expensive but it's ugly to the eye to have it done multiple times, specially when we already validated it during initial rx processing. This patch takes advantage of the now shared sctp_input_cb and make the pointer to the operations readily available. Signed-off-by: Marcelo Ricardo Leitner --- include/net/sctp/structs.h | 1 + net/sctp/input.c | 1 + net/sctp/inqueue.c | 1 + net/sctp/sm_make_chunk.c | 20 net/sctp/sm_statefuns.c| 7 ++- 5 files changed, 9 insertions(+), 21 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index f6f201de6fa46b3ca203c00f4970ca408edb6930..ce93c4b10d2620a3ac4c9efe39a86e5d231b51c2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1104,6 +1104,7 @@ struct sctp_input_cb { #endif } header; struct sctp_chunk *chunk; + struct sctp_af *af; }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) diff --git a/net/sctp/input.c b/net/sctp/input.c index 7a327ff71f08985f6ebb963d5cdc9540b23d0666..30d72f7707b6df5b41679bbfc5e595d5a11130ea 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -140,6 +140,7 @@ int sctp_rcv(struct sk_buff *skb) af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; + SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. 
*/ af->from_skb(&src, skb, 1); diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 147d975b04559f7858b040b1f04dbc559ef2ec78..8fc773f9b59a8a9ad123dd132cfa5b7f916732b6 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -224,6 +224,7 @@ new_skb: *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; + cb->af = head_cb->af; } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 1c96f4740e67397e5f8b7134cffd4d0840220245..8c77b87a8565cb4f82c09cea65557dc9c8d1138f 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -108,14 +108,9 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) /* What was the inbound interface for this chunk? */ int sctp_chunk_iif(const struct sctp_chunk *chunk) { - struct sctp_af *af; - int iif = 0; - - af = sctp_get_af_specific(ipver2af(ip_hdr(chunk->skb)->version)); - if (af) - iif = af->skb_iif(chunk->skb); + struct sk_buff *skb = chunk->skb; - return iif; + return SCTP_INPUT_CB(skb)->af->skb_iif(skb); } /* RFC 2960 3.3.2 Initiation (INIT) (1) @@ -1600,7 +1595,6 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_association *asoc; struct sk_buff *skb; sctp_scope_t scope; - struct sctp_af *af; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); @@ -1610,16 +1604,10 @@ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ - af = sctp_get_af_specific(ipver2af(ip_hdr(skb)->version)); - if (unlikely(!af)) - goto fail; - af->from_skb(&asoc->c.peer_addr, skb, 1); + SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); + nodata: return asoc; - -fail: - sctp_association_free(asoc); - return NULL; } /* Build a cookie representing asoc. 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 5aabf42065e2fba9388350996310b77c58369395..b7c1f7f3c8388400e51e3fbdbe099bc354559913 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -6119,13 +6119,10 @@ static int sctp_eat_data(const struct sctp_association *asoc, */ if (!chunk->ecn_ce_done) { - struct sctp_af *af; + struct sctp_af *af = SCTP_INPUT_CB(chunk->skb)->af; chunk->ecn_ce_done = 1; - af = sctp_get_af_specific( - ipver2af(ip_hdr(chunk->skb)->version)); - - if (af && af->is_ce(sctp_gso_headskb(chunk->skb)) && + if (af->is_ce(sctp_gso_headskb(chunk->skb)) && asoc->peer.ecn_capable) { /* Do real work as sideffect. */ sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, -- 2.7.4
[PATCH net-next 5/6] sctp: do not clear chunk->ecn_ce_done flag
We should not clear that flag when switching to a new skb from a GSO skb because it would cause ECN processing to happen multiple times per GSO skb, which is not wanted. Instead, let it be processed once per chunk. That is, in other words, once per IP header available. Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner --- net/sctp/inqueue.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 8fc773f9b59a8a9ad123dd132cfa5b7f916732b6..942770675f4cc0efc9686f4e4038450f060f34ae 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -217,7 +217,6 @@ new_skb: chunk->auth = 0; chunk->has_asconf = 0; chunk->end_of_packet = 0; - chunk->ecn_ce_done = 0; if (chunk->head_skb) { struct sctp_input_cb *cb = SCTP_INPUT_CB(chunk->skb), -- 2.7.4
[PATCH net-next 3/6] sctp: allow GSO frags to access the chunk too
SCTP will try to access original IP headers on sctp_recvmsg in order to copy the addresses used. There are also other places that do similar access to IP or even SCTP headers. But after 90017accff61 ("sctp: Add GSO support") they aren't always there because they are only present in the header skb. SCTP handles the queueing of incoming data by cloning the incoming skb and limiting to only the relevant payload. This clone has its cb updated to something different and it's then queued on socket rx queue. Thus we need to fix this in two moments. For rx path, not related to socket queue yet, this patch uses a partially copied sctp_input_cb to such GSO frags. This restores the ability to access the headers for this part of the code. Regarding the socket rx queue, it removes iif member from sctp_event and also adds a chunk pointer on it. With these changes we're always able to reach the headers again. The biggest change here is that now the sctp_chunk struct and the original skb are only freed after the application consumed the buffer. Note however that the original payload was already like this due to the skb cloning. For iif, SCTP's IPv4 code doesn't use it, so no change is necessary. IPv6 now can fetch it directly from original's IPv6 CB as the original skb is still accessible. In the future we probably can simplify sctp_v*_skb_iif() stuff, as sctp_v4_skb_iif() was called but its return value not used, and now it's not even called, but such cleanup is out of scope for this change. 
Fixes: 90017accff61 ("sctp: Add GSO support") Signed-off-by: Marcelo Ricardo Leitner --- include/net/sctp/structs.h | 7 +++ include/net/sctp/ulpevent.h | 2 +- net/sctp/inqueue.c | 7 +++ net/sctp/ipv6.c | 9 - net/sctp/protocol.c | 1 + net/sctp/sm_statefuns.c | 3 ++- net/sctp/socket.c | 10 +++--- net/sctp/ulpevent.c | 10 +- 8 files changed, 38 insertions(+), 11 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 966c3a40039c12a7c525612594a51312d5de1d2a..f6f201de6fa46b3ca203c00f4970ca408edb6930 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1107,6 +1107,13 @@ struct sctp_input_cb { }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) +static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb) +{ + const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + + return chunk->head_skb ? : skb; +} + /* These bind address data fields common between endpoints and associations */ struct sctp_bind_addr { diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index aa342645dbce446186d55151c3f507cf0e165b44..2c098cd7e7e202b6fa96e97ccb56471df27cec91 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -48,11 +48,11 @@ */ struct sctp_ulpevent { struct sctp_association *asoc; + struct sctp_chunk *chunk; unsigned int rmem_len; __u32 ppid; __u32 tsn; __u32 cumtsn; - int iif; __u16 stream; __u16 ssn; __u16 flags; diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index edabbbdfca541b830526a7a52aee18c20680c19c..147d975b04559f7858b040b1f04dbc559ef2ec78 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -218,6 +218,13 @@ new_skb: chunk->has_asconf = 0; chunk->end_of_packet = 0; chunk->ecn_ce_done = 0; + if (chunk->head_skb) { + struct sctp_input_cb + *cb = SCTP_INPUT_CB(chunk->skb), + *head_cb = SCTP_INPUT_CB(chunk->head_skb); + + cb->chunk = head_cb->chunk; + } } chunk->chunk_hdr = ch; diff --git a/net/sctp/ipv6.c 
b/net/sctp/ipv6.c index 0657d18a85bf7aa751a0456d0cc9adae3ff95e42..ae6f1a2178bab81fa14562bd1c37d1e7b1e3 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -420,6 +420,7 @@ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; + /* Always called on head skb, so this is safe */ sh = sctp_hdr(skb); if (is_saddr) { *port = sh->source; @@ -710,8 +711,7 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { - struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; - return opt->iif; + return IP6CB(skb)->iif; } /* Was this packet marked by Explicit Congestion Notification? */ @@ -780,15 +780,14 @@ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET;
[PATCH net-next 2/6] sctp: reorder sctp_ulpevent and shrink msg_flags
The next patch needs 8 bytes in there. sctp_ulpevent has a hole due to bad alignment; msg_flags is using 4 bytes while it actually uses only 2, so we shrink it, and iif member (4 bytes) which can be easily fetched from another place once the next patch is there, so we remove it and thus creating space for 8 bytes. Signed-off-by: Marcelo Ricardo Leitner --- include/net/sctp/ulpevent.h | 10 +- net/sctp/ulpevent.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index cccdcfd149736b315554d64c2a556e0ad6496fc8..aa342645dbce446186d55151c3f507cf0e165b44 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -48,15 +48,15 @@ */ struct sctp_ulpevent { struct sctp_association *asoc; - __u16 stream; - __u16 ssn; - __u16 flags; + unsigned int rmem_len; __u32 ppid; __u32 tsn; __u32 cumtsn; - int msg_flags; int iif; - unsigned int rmem_len; + __u16 stream; + __u16 ssn; + __u16 flags; + __u16 msg_flags; }; /* Retrieve the skb this event sits inside of. */ diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index d1e38308f6159c0e4da7db966c16afc6a956b554..706f5bc9f0c3083ab455ec78b963cd609a3a95b5 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -51,7 +51,7 @@ static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); /* Initialize an ULP event from an given skb. */ static void sctp_ulpevent_init(struct sctp_ulpevent *event, - int msg_flags, + __u16 msg_flags, unsigned int len) { memset(event, 0, sizeof(struct sctp_ulpevent)); @@ -60,7 +60,7 @@ static void sctp_ulpevent_init(struct sctp_ulpevent *event, } /* Create a new sctp_ulpevent. */ -static struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, +static struct sctp_ulpevent *sctp_ulpevent_new(int size, __u16 msg_flags, gfp_t gfp) { struct sctp_ulpevent *event; -- 2.7.4
Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()
On Wed, Jul 13, 2016 at 02:48:44PM +0300, Dan Carpenter wrote: > We accidentally return success when we had intended to return an error > code. > > Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads > mode') > Signed-off-by: Dan Carpenter > v2: return -ENOTSUPP instead --EINVAL > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c > b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c > index 1842dfb..7d982cf 100644 > +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c > @@ -183,6 +183,7 @@ static int esw_create_offloads_fdb_table(struct > mlx5_eswitch *esw, int nvports) > > root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); > if (!root_ns) { > + err = -ENOTSUPP; Did you mean ENOTSUP? I thought ENOTSUPP was not to be used outside NFS, and isn't properly exported to userspace.. $ find /usr/include -name "*errno*" | xargs grep 524 Jason
Re: [PATCH net] sock_diag: invert socket destroy broadcast check
On Fri, Jun 24, 2016 at 6:22 PM, Willem de Bruijn wrote: > On Fri, Jun 24, 2016 at 4:41 PM, Eric W. Biederman > wrote: >> Willem de Bruijn writes: >> >>> From: Willem de Bruijn >>> >>> Socket destruction is only broadcast for a socket sk if a diag >>> listener is registered and sk is not a kernel socket. >>> >>> Invert the test to not even check for listeners for kernel sockets. >>> >>> The sock_diag_has_destroy_listeners invocation dereferences >>> sock_net(sk), which for kernel sockets can be invalid as they do not >>> take a reference on the network namespace. >> >> No. That isn't so. A kernel socket for a network namespace must be >> destroyed in the network namespace teardown. I spent some more time looking at this. inet_ctl_sock_destroy does not destroy the socket if there are still skbuff with a reference on it (or its sk_wmem_alloc). Skbs are orphaned when they leave the namespace through dev_forward_skb, but not when sent out a physical nic (correctly, that would break TSQ). The bug happened with macvlan on top of bonding on top of a physical nic. The macvlan lives in a temporary namespace. After the macvlan and network namespace are destroyed, the physical device has a TCP RST skb from net.ipv4->tcp_sk queued for tx completion. I have not been able to reproduce this exact scenario, likely because tx completion handling is on the order of microseconds and not easily slowed sufficiently for testing. Using a tap device with skb_orphan commented out, I can cause the issue. Commenting out skb_orphan is clearly a gross hack. The point I wanted to verify is that underlying device is not stopped --and its queues cleaned of skb-- when the macvlan device is destroyed. Network namespace teardown is complex. Am I missing a step that does prevent the above, or does this indeed sound feasible in principle (if very unlikely in practice)?
Re: Configure traffic class to bringup DCB in back-to-back setup
On 16-07-13 02:09 AM, ayuj wrote: > I just checked TLV's. Below are the details: > OK so not really a netdev discussion seeing its just a user space protocol setup issue. Going forward probably drop netdev and add intel-wired-lan. > OS :- CentOS 7.2 > kernel 3.10.0-327.el7.x86_64 > lldpad:- lldpad v0.9.46 > dcbtool:- v0.9.46 > ixgbe :- ixgbe-4.3.15 > > steps followed:- > > # modporbe ixgbe > # service lldpad start > Redirecting to /bin/systemctl start lldpad.service > > # service lldpad status > Redirecting to /bin/systemctl status lldpad.service > ● lldpad.service - Link Layer Discovery Protocol Agent Daemon. >Loaded: loaded (/usr/lib/systemd/system/lldpad.service; disabled; vendor > preset: disabled) >Active: active (running) since Tue 2016-07-05 05:49:12 EDT; 1s ago > Main PID: 133737 (lldpad) >CGroup: /system.slice/lldpad.service >└─133737 /usr/sbin/lldpad -t > > Jul 05 05:49:12 localhost.localdomain systemd[1]: Started Link Layer > Discovery Protocol Agent Daemon.. > Jul 05 05:49:12 localhost.localdomain systemd[1]: Starting Link Layer > Discovery Protocol Agent Daemon > > lldptool -t -i p3p2 -n > Chassis ID TLV > MAC: 00:1b:21:bb:2e:da > Port ID TLV > MAC: 00:1b:21:bb:2e:da > Time to Live TLV > 120 > IEEE 8021QAZ ETS Configuration TLV >Willing: yes >CBS: not supported >MAX_TCS: 8 >PRIO_MAP: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0 >TC Bandwidth: 0% 0% 0% 0% 0% 0% 0% 0% >TSA_MAP: 0:strict 1:strict 2:strict 3:strict 4:strict 5:strict 6:strict > 7:strict > IEEE 8021QAZ PFC TLV >Willing: yes >MACsec Bypass Capable: no >PFC capable traffic classes: 8 >PFC enabled: none > End of LLDPDU TLV > > Please help me in configuring traffic classes. I want to bringup DCB setup > in a back-to-back senario. > So at the moment it appears to be configured to use 802.1QAZ spec which superseded the older spec even though lldpad supports both. Note the tool itself really requires some spec knowledge to use correctly. The spec to read is 802.1Q. 
To configure it back-to-back (typical scenario is connected to a DCB enabled switch where your administrator would setup the switch and this would autoneg just fine) the servers need to be setup manually. Perhaps reading if you haven't already the man page for lldptool and lldptool-ets, lldptool-pfc would help. From the ets man page this should kick things off, #lldptool -T -i eth2 -V ETS-CFG \ tsa=0:ets,1:ets,2:ets,3:ets,4:ets,5:ets,6:ets,7:ets \ up2tc=0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7 \ tcbw=12,12,12,12,13,13,13,13 #lldptool -T -i eth2 -V ETS-REC \ tsa=0:ets,1:ets,2:ets,3:ets,4:ets,5:ets,6:ets,7:ets \ up2tc=0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7 \ tcbw=12,12,12,12,13,13,13,13 Thanks, John
Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()
On Wed, Jul 13, 2016 at 02:48:44PM +0300, Dan Carpenter wrote: > We accidentally return success when we had intended to return an error > code. > > Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads > mode') > Signed-off-by: Dan Carpenter > --- > v2: return -ENOTSUPP instead --EINVAL I'm a little bit confused. Why did you prefer ENOTSUPP over EOPNOTSUPP? Thanks. signature.asc Description: Digital signature
[PATCH 1/2] net: nps_enet: fix coding style issues
From: Elad Kanfi Fix following coding style problems : ERROR: else should follow close brace '}' + } + else { /* !dst_is_aligned */ WARNING: Missing a blank line after declarations + u32 buf = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF); + put_unaligned_be32(buf, reg); WARNING: Missing a blank line after declarations + u32 buf; + ioread32_rep(priv->regs_base + NPS_ENET_REG_RX_BUF, &buf, 1); CHECK: Blank lines aren't necessary before a close brace '}' + + } total: 1 errors, 2 warnings, 1 checks, 683 lines checked Signed-off-by: Elad Kanfi --- drivers/net/ethernet/ezchip/nps_enet.c |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ezchip/nps_enet.c b/drivers/net/ethernet/ezchip/nps_enet.c index 06f0317..b182e2a 100644 --- a/drivers/net/ethernet/ezchip/nps_enet.c +++ b/drivers/net/ethernet/ezchip/nps_enet.c @@ -46,16 +46,17 @@ static void nps_enet_read_rx_fifo(struct net_device *ndev, if (dst_is_aligned) { ioread32_rep(priv->regs_base + NPS_ENET_REG_RX_BUF, reg, len); reg += len; - } - else { /* !dst_is_aligned */ + } else { /* !dst_is_aligned */ for (i = 0; i < len; i++, reg++) { u32 buf = nps_enet_reg_get(priv, NPS_ENET_REG_RX_BUF); + put_unaligned_be32(buf, reg); } } /* copy last bytes (if any) */ if (last) { u32 buf; + ioread32_rep(priv->regs_base + NPS_ENET_REG_RX_BUF, &buf, 1); memcpy((u8 *)reg, &buf, last); } @@ -459,7 +460,6 @@ static void nps_enet_set_rx_mode(struct net_device *ndev) | NPS_ENET_ENABLE << CFG_2_DISK_DA_SHIFT; ge_mac_cfg_2_value = (ge_mac_cfg_2_value & ~CFG_2_DISK_MC_MASK) | NPS_ENET_ENABLE << CFG_2_DISK_MC_SHIFT; - } nps_enet_reg_set(priv, NPS_ENET_REG_GE_MAC_CFG_2, ge_mac_cfg_2_value); -- 1.7.1
Re: [PATCH v7 09/11] net/mlx4_en: add xdp forwarding and data write support
On Wed, Jul 13, 2016 at 06:25:28PM +0300, Saeed Mahameed wrote: > On Tue, Jul 12, 2016 at 12:29 AM, Brenden Blanco wrote: > > A user will now be able to loop packets back out of the same port using > > a bpf program attached to xdp hook. Updates to the packet contents from > > the bpf program is also supported. > > > > For the packet write feature to work, the rx buffers are now mapped as > > bidirectional when the page is allocated. This occurs only when the xdp > > hook is active. > > > > When the program returns a TX action, enqueue the packet directly to a > > dedicated tx ring, so as to avoid completely any locking. This requires > > the tx ring to be allocated 1:1 for each rx ring, as well as the tx > > completion running in the same softirq. > > > > Upon tx completion, this dedicated tx ring recycles pages without > > unmapping directly back to the original rx ring. In steady state tx/drop > > workload, effectively 0 page allocs/frees will occur. > > > > Signed-off-by: Brenden Blanco > > --- > > drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 15 ++- > > drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 19 +++- > > drivers/net/ethernet/mellanox/mlx4/en_rx.c | 14 +++ > > drivers/net/ethernet/mellanox/mlx4/en_tx.c | 126 > > +++- > > drivers/net/ethernet/mellanox/mlx4/mlx4_en.h| 14 ++- > > 5 files changed, 181 insertions(+), 7 deletions(-) > > > > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > > b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > > index d3d51fa..10642b1 100644 > > --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > > +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > > @@ -1694,6 +1694,11 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, > > struct ethtool_rxnfc *cmd) > > return err; > > } > > > > +static int mlx4_en_max_tx_channels(struct mlx4_en_priv *priv) > > +{ > > + return (MAX_TX_RINGS - priv->rsv_tx_rings) / MLX4_EN_NUM_UP; > > +} > > + > > MAX_TX_RING is a software limitation made to limit netdev 
real_num_tx > queues for CX3 internal cache utilization, > in your case the netdev doesn't care about xdp_tx rings, the > accounting you added in this patch adds a lot of > complications and it would be better to have clear separation between > the two types of tx_rings, in terms of the holding/managing data > structure. > > I suggest here to leave priv->tx_ring untouched. i.e, don't store the > xdp_tx rings at the end of it, i.e priv->tx_ring should only reflect > the > netdev real tx queues. > > In case of priv->porg is active, we can allocate and store xdp tx ring > per rx ring, this tx ring will be allocated and activated > once the rx ring is created and activated, and store this dedicated tx > ring in the rx_ring it self. > > i.e : > struct mlx4_en_rx_ring { > [...] > struct mlx4_en_tx_ring *xdp_tx; > struct mlx4_en_cq *xdp_tx_cq; > [...] > } > > for this the following changes are required. > > @ mlx4_en_create_rx_ring > [...] // Create the RX ring > > /* create CQ for xdp tx ring */ > node = cpu_to_node(i % num_online_cpus()); > > mlx4_en_create_cq(priv, &rx_ring->xdp_tx_cq, prof->tx_ring_size, i, TX, node) > > /* create xdp tx ring */ > mlx4_en_create_tx_ring(priv, &rx_ring->xdp_tx, prof->tx_ring_size, > TXBB_SIZE, node, -1) > > @mlx4_en_start/stop_port > /* Configure tx cq's and rings */ > // You will need to configure xdp tx rings same as priv->rx_ring_num rings > > @mlx4_en_poll_tx_cq > This Also will require a new NAPI handler for xdp rings to replace the > following line @mlx4_en_poll_tx_cq > - struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; > with > + struct mlx4_en_tx_ring *ring = priv->rx_ring[cq->ring].xdp_tx; > > Or just change cq->ring from ring index to the actual ring pointer. > > Bottom line, my suggestion also started to look complicated :).. but > still it would look cleaner to separate between netdev rings and xdp > rings. > I considered this at first too, but it seemed the worse option to me at the time. 
There would be a lot of copy/paste as well as new code to review. > > > static void mlx4_en_get_channels(struct net_device *dev, > > struct ethtool_channels *channel) > > { > > @@ -1705,7 +1710,7 @@ static void mlx4_en_get_channels(struct net_device > > *dev, > > channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP; > > > > channel->rx_count = priv->rx_ring_num; > > - channel->tx_count = priv->tx_ring_num / MLX4_EN_NUM_UP; > > + channel->tx_count = priv->num_tx_rings_p_up; > > } > > > > static int mlx4_en_set_channels(struct net_device *dev, > > @@ -1717,7 +1722,7 @@ static int mlx4_en_set_channels(struct net_device > > *dev, > > int err = 0; > > > > if (channel->other_count || channel->combined_count || > > - channel->tx_count > MLX4_EN_MAX_TX_RING_P_UP || > > + channel->tx_count > mlx4_en_max_tx_channels(priv) || > > channel->rx_coun
Re: [PATCH 1/1] tracing, bpf: Implement function bpf_probe_write
On Wed, Jul 13, 2016 at 03:36:11AM -0700, Sargun Dhillon wrote: > Provides BPF programs, attached to kprobes a safe way to write to > memory referenced by probes. This is done by making probe_kernel_write > accessible to bpf functions via the bpf_probe_write helper. not quite :) > Signed-off-by: Sargun Dhillon > --- > include/uapi/linux/bpf.h | 3 +++ > kernel/trace/bpf_trace.c | 20 > samples/bpf/bpf_helpers.h | 2 ++ > 3 files changed, 25 insertions(+) > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 406459b..355b565 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -313,6 +313,9 @@ enum bpf_func_id { > */ > BPF_FUNC_skb_get_tunnel_opt, > BPF_FUNC_skb_set_tunnel_opt, > + > + BPF_FUNC_probe_write, /* int bpf_probe_write(void *dst, void *src, > int size) */ > + the patch is against some old kernel. Please always make the patch against net-next tree and cc netdev list. > +static u64 bpf_probe_write(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) > +{ > + void *dst = (void *) (long) r1; > + void *unsafe_ptr = (void *) (long) r2; > + int size = (int) r3; > + > + return probe_kernel_write(dst, unsafe_ptr, size); > +} the patch is whitespace mangled. Please see Documentation/networking/netdev-FAQ.txt the main issue though is that we cannot simply allow bpf to do probe_write, since it may crash the kernel. What might be ok is to allow writing into memory of current user space process only. This way bpf prog will keep kernel safety guarantees, yet it will be able to modify user process memory when necessary. Since bpf+tracing is root only, it doesn't pose security risk.
Re: [patch] net/mlx5: missing error code in esw_create_offloads_fdb_table()
On 13/07/2016 13:08, Dan Carpenter wrote: We accidentally return success when we had intended to return an error code. Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads mode') Signed-off-by: Dan Carpenter diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1842dfb..7d982cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -183,6 +183,7 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports) root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); if (!root_ns) { + err = -EINVAL; esw_warn(dev, "Failed to get FDB flow namespace\n"); goto ns_err; } Hi, Thanks for the patch. I'm not sure EINVAL is the right error here though. Maybe -ENOTSUPP is a bit more appropriate here. Regards, Matan
Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records
On 07/13/2016 06:40 PM, Peter Zijlstra wrote: On Wed, Jul 13, 2016 at 04:08:55PM +0200, Daniel Borkmann wrote: On 07/13/2016 03:42 PM, Peter Zijlstra wrote: Ok so the nonlinear thing was it doing _two_ copies, one the regular __output_copy() on raw->data and second the optional fragment thingy using __output_custom(). Would something like this work instead? It does the nonlinear thing and the custom copy function thing but allows more than 2 fragments and allows each fragment to have a custom copy. It doesn't look obviously more expensive; it has the one ->copy branch extra, but then it doesn't recompute the sizes. Yes, that would work as well on a quick glance with diff just a bit bigger, but more generic this way. Do you want me to adapt this into the first patch? Please. One question below: - u64 zero = 0; - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); We still need the zero padding here from above with the computed raw->size, right? Ah, yes, we need some __output*() in order to advance the handle offset. We don't _need_ to copy the 0s, but I doubt __output_skip() is much cheaper for these 1-3 bytes worth of data; we've already touched that line anyway. Okay, thanks for your input! I'll respin then.
[PATCH] bonding: set carrier off for devices created through netlink
Commit e826eafa65c6 ("bonding: Call netif_carrier_off after register_netdevice") moved netif_carrier_off() from bond_init() to bond_create(), but the latter is called only for initial default devices and ones created through sysfs: $ modprobe bonding $ echo +bond1 > /sys/class/net/bonding_masters $ ip link add bond2 type bond $ grep "MII Status" /proc/net/bonding/* /proc/net/bonding/bond0:MII Status: down /proc/net/bonding/bond1:MII Status: down /proc/net/bonding/bond2:MII Status: up Ensure that carrier is initially off also for devices created through netlink. Signed-off-by: Beniamino Galvani --- drivers/net/bonding/bond_netlink.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index db760e8..b8df0f5 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -446,7 +446,11 @@ static int bond_newlink(struct net *src_net, struct net_device *bond_dev, if (err < 0) return err; - return register_netdevice(bond_dev); + err = register_netdevice(bond_dev); + + netif_carrier_off(bond_dev); + + return err; } static size_t bond_get_size(const struct net_device *bond_dev) -- 2.5.5
[PATCH] rndis_host: Set random MAC for ZTE MF910
From: Kristian Evensen All ZTE MF910 mifis, at least on some revisions, export the same MAC address (36:4b:50:b7:ef:da). Check for this MAC address and set a random MAC if detected. Also, changed the memcpy() to ether_addr_copy(), as pointed out by checkpatch. Signed-off-by: Kristian Evensen --- drivers/net/usb/rndis_host.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/rndis_host.c b/drivers/net/usb/rndis_host.c index 524a47a281..85bdbdf 100644 --- a/drivers/net/usb/rndis_host.c +++ b/drivers/net/usb/rndis_host.c @@ -295,6 +295,9 @@ static const struct net_device_ops rndis_netdev_ops = { .ndo_validate_addr = eth_validate_addr, }; +/* well-known buggy ZTE MF910 MAC address */ +static const u8 buggy_zte_addr[ETH_ALEN] = {0x36, 0x4b, 0x50, 0xb7, 0xef, 0xda}; + int generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) { @@ -428,7 +431,11 @@ generic_rndis_bind(struct usbnet *dev, struct usb_interface *intf, int flags) dev_err(&intf->dev, "rndis get ethaddr, %d\n", retval); goto halt_fail_and_release; } - memcpy(net->dev_addr, bp, ETH_ALEN); + + if (ether_addr_equal(bp, buggy_zte_addr)) + eth_hw_addr_random(net); + else + ether_addr_copy(net->dev_addr, bp); /* set a nonzero filter to enable data transfers */ memset(u.set, 0, sizeof *u.set); -- 2.5.0
Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records
On Wed, Jul 13, 2016 at 04:08:55PM +0200, Daniel Borkmann wrote: > Hi Peter, > > On 07/13/2016 03:42 PM, Peter Zijlstra wrote: > > > >Ok so the nonlinear thing was it doing _two_ copies, one the regular > >__output_copy() on raw->data and second the optional fragment thingy > >using __output_custom(). > > > >Would something like this work instead? > > > >It does the nonlinear thing and the custom copy function thing but > >allows more than 2 fragments and allows each fragment to have a custom > >copy. > > > >It doesn't look obviously more expensive; it has the one ->copy branch > >extra, but then it doesn't recompute the sizes. > > Yes, that would work as well on a quick glance with diff just a bit > bigger, but more generic this way. Do you want me to adapt this into > the first patch? Please. > One question below: > > >-u64 zero = 0; > >-if (real_size - raw_size) > >-__output_copy(handle, &zero, real_size - > >raw_size); > > We still need the zero padding here from above with the computed > raw->size, right? Ah, yes, we need some __output*() in order to advance the handle offset. We don't _need_ to copy the 0s, but I doubt __output_skip() is much cheaper for these 1-3 bytes worth of data; we've already touched that line anyway.
Re: [PATCH v3] Marvell phy: add fiber status check and configuration for some phys
> +static int marvell_resume_fiber(struct phy_device *phydev) > +{ > +int err; > + > +/* Resume the fiber mode first */ > +err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_FIBER); > +if (err < 0) > +goto error; > + > +err = genphy_resume(phydev); > +if (err < 0) > +goto error; > + > +/* Then, the copper link */ > +err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_COPPER); > +if (err < 0) > +goto error; > + > +return genphy_resume(phydev); > >>> > >>> Should it be resumed twice? Or just once at the end? Same question > >>> for suspend. > >> > >> I don't understand your question. > > > > You call genphy_resume(phydev) twice. Once is sufficient. > > Yes, but it's normal because each interface could be suspended or resumed > independently. > genphy_* functions use BMCR register which are identical between > fiber and copper link. But each link has its own register to change. Ah! Now i get it. I think you need a comment here. Something like: /* With the page set, use the generic resume */ What i was worried about is that there is some reference counting going on inside these functions. And so suspending the same phydev multiple times will mess up the reference counts. But no, it just twiddles a register bit, so that is O.K. Andrew
Re: [PATCH v8 06/11] net/mlx4_en: add page recycle to prepare rx ring for tx support
On Wed, Jul 13, 2016 at 10:17:26AM +0300, Tariq Toukan wrote: > > On 13/07/2016 3:54 AM, Brenden Blanco wrote: > >On Tue, Jul 12, 2016 at 02:18:32PM -0700, David Miller wrote: > >>From: Brenden Blanco > >>Date: Tue, 12 Jul 2016 00:51:29 -0700 > >> > >>>+ mlx4_en_free_resources(priv); > >>>+ > >>> old_prog = xchg(&priv->prog, prog); > >>> if (old_prog) > >>> bpf_prog_put(old_prog); > >>>- return 0; > >>>+ err = mlx4_en_alloc_resources(priv); > >>>+ if (err) { > >>>+ en_err(priv, "Failed reallocating port resources\n"); > >>>+ goto out; > >>>+ } > >>>+ if (port_up) { > >>>+ err = mlx4_en_start_port(dev); > >>>+ if (err) > >>>+ en_err(priv, "Failed starting port\n"); > >>A failed configuration operation should _NEVER_ leave the interface in > >>an inoperative state like these error paths do. > >> > >>You must instead preallocate the necessary resources, and only change > >>the chip's configuration and commit to the new settings once you have > >>successfully allocated those resources. > >I'll see what I can do here. > That's exactly what we're doing in a patchset that will be submitted > to net very soon (this week). Thanks Tariq! As an example, I had originally tried to integrate this code into mlx4_en_set_channels, which seems to have the same problem. > It fixes/refactors these failure flows just like Dave described, > something like: > > err = mlx4_en_try_alloc_resources(priv, tmp, &new_prof); > if (err) > goto out; > > if (priv->port_up) { > port_up = 1; > mlx4_en_stop_port(dev, 1); > } > > mlx4_en_safe_replace_resources(priv, tmp); > > if (port_up) { > err = mlx4_en_start_port(dev); > if (err) > en_err(priv, "Failed starting port\n"); > } > > I suggest you keep your code aligned with current net-next driver, > and later I will take it and fix it (once merged with net). Another option is to avoid entirely the tx_ring_num change, so as to keep the majority of the initialized state valid. 
We would only allocate a new set of pages and refill the rx rings once we have confirmed there are enough resources. So others can follow the discussion, there are multiple reasons to reconfigure the rings. 1. The rx frags should be page-per-packet 2. The pages should be mapped DMA_BIDIRECTIONAL 3. Each rx ring should have a dedicated tx ring, which is off limits from the upper stack 4. The dedicated tx ring will have a pointer back to its rx ring for recycling #1 and #2 can be done to the side ahead of time, as you are also suggesting. Currently, to achieve #3, we increase tx_ring_num while keeping num_tx_rings_p_up the same. This precipitates a round of free/alloc_resources, which takes some time and has many opportunities for failure. However, we could resurrect an earlier approach that keeps the tx_ring_num unchanged, and instead just do a netif_set_real_num_tx_queues(tx_ring_num - rsv_tx_rings) to hide it from the stack. This would require that there be enough rings ahead of time, with a simple bounds check like: if (tx_ring_num < rsv_tx_rings + MLX4_EN_MAX_TX_RING_P_UP) { en_err(priv, "XDP requires minimum %d + %d rings\n", rsv_tx_rings, MLX4_EN_MAX_TX_RING_P_UP); return -EINVAL; } The default values for tx_ring_num and rx_ring_num will only hit this case when operating in a low memory environment, in which case the user must increase the number of channels manually. I think that is a fair tradeoff. The rest of #1, #2, and #4 can be done in a guaranteed fashion once the buffers are allocated, since it would just be a few loops to refresh the rx_desc and recycle_ring. > > Regards, > Tariq
Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()
On 13/07/2016 14:48, Dan Carpenter wrote: We accidentally return success when we had intended to return an error code. Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads mode') Signed-off-by: Dan Carpenter --- v2: return -ENOTSUPP instead of -EINVAL diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 1842dfb..7d982cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -183,6 +183,7 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports) root_ns = mlx5_get_flow_namespace(dev, MLX5_FLOW_NAMESPACE_FDB); if (!root_ns) { + err = -ENOTSUPP; esw_warn(dev, "Failed to get FDB flow namespace\n"); goto ns_err; } Thanks. Reviewed-by: Matan Barak
Re: [PATCH v7 09/11] net/mlx4_en: add xdp forwarding and data write support
On Tue, Jul 12, 2016 at 12:29 AM, Brenden Blanco wrote: > A user will now be able to loop packets back out of the same port using > a bpf program attached to xdp hook. Updates to the packet contents from > the bpf program is also supported. > > For the packet write feature to work, the rx buffers are now mapped as > bidirectional when the page is allocated. This occurs only when the xdp > hook is active. > > When the program returns a TX action, enqueue the packet directly to a > dedicated tx ring, so as to avoid completely any locking. This requires > the tx ring to be allocated 1:1 for each rx ring, as well as the tx > completion running in the same softirq. > > Upon tx completion, this dedicated tx ring recycles pages without > unmapping directly back to the original rx ring. In steady state tx/drop > workload, effectively 0 page allocs/frees will occur. > > Signed-off-by: Brenden Blanco > --- > drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 15 ++- > drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 19 +++- > drivers/net/ethernet/mellanox/mlx4/en_rx.c | 14 +++ > drivers/net/ethernet/mellanox/mlx4/en_tx.c | 126 > +++- > drivers/net/ethernet/mellanox/mlx4/mlx4_en.h| 14 ++- > 5 files changed, 181 insertions(+), 7 deletions(-) > > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > index d3d51fa..10642b1 100644 > --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c > @@ -1694,6 +1694,11 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, > struct ethtool_rxnfc *cmd) > return err; > } > > +static int mlx4_en_max_tx_channels(struct mlx4_en_priv *priv) > +{ > + return (MAX_TX_RINGS - priv->rsv_tx_rings) / MLX4_EN_NUM_UP; > +} > + MAX_TX_RING is a software limitation made to limit netdev real_num_tx queues for CX3 internal cache utilization, in your case the netdev doesn't care about xdp_tx rings, the accounting you added in this patch adds a lot 
of complications and it would be better to have clear separation between the two types of tx_rings, in terms of the holding/managing data structure. I suggest here to leave priv->tx_ring untouched. i.e, don't store the xdp_tx rings at the end of it, i.e priv->tx_ring should only reflect the netdev real tx queues. In case priv->prog is active, we can allocate and store xdp tx ring per rx ring, this tx ring will be allocated and activated once the rx ring is created and activated, and store this dedicated tx ring in the rx_ring it self. i.e : struct mlx4_en_rx_ring { [...] struct mlx4_en_tx_ring *xdp_tx; struct mlx4_en_cq *xdp_tx_cq; [...] } for this the following changes are required. @ mlx4_en_create_rx_ring [...] // Create the RX ring /* create CQ for xdp tx ring */ node = cpu_to_node(i % num_online_cpus()); mlx4_en_create_cq(priv, &rx_ring->xdp_tx_cq, prof->tx_ring_size, i, TX, node) /* create xdp tx ring */ mlx4_en_create_tx_ring(priv, &rx_ring->xdp_tx, prof->tx_ring_size, TXBB_SIZE, node, -1) @mlx4_en_start/stop_port /* Configure tx cq's and rings */ // You will need to configure xdp tx rings same as priv->rx_ring_num rings @mlx4_en_poll_tx_cq This Also will require a new NAPI handler for xdp rings to replace the following line @mlx4_en_poll_tx_cq - struct mlx4_en_tx_ring *ring = priv->tx_ring[cq->ring]; with + struct mlx4_en_tx_ring *ring = priv->rx_ring[cq->ring].xdp_tx; Or just change cq->ring from ring index to the actual ring pointer. Bottom line, my suggestion also started to look complicated :).. but still it would look cleaner to separate between netdev rings and xdp rings. 
> static void mlx4_en_get_channels(struct net_device *dev, > struct ethtool_channels *channel) > { > @@ -1705,7 +1710,7 @@ static void mlx4_en_get_channels(struct net_device *dev, > channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP; > > channel->rx_count = priv->rx_ring_num; > - channel->tx_count = priv->tx_ring_num / MLX4_EN_NUM_UP; > + channel->tx_count = priv->num_tx_rings_p_up; > } > > static int mlx4_en_set_channels(struct net_device *dev, > @@ -1717,7 +1722,7 @@ static int mlx4_en_set_channels(struct net_device *dev, > int err = 0; > > if (channel->other_count || channel->combined_count || > - channel->tx_count > MLX4_EN_MAX_TX_RING_P_UP || > + channel->tx_count > mlx4_en_max_tx_channels(priv) || > channel->rx_count > MAX_RX_RINGS || > !channel->tx_count || !channel->rx_count) > return -EINVAL; > @@ -1731,7 +1736,8 @@ static int mlx4_en_set_channels(struct net_device *dev, > mlx4_en_free_resources(priv); > > priv->num_tx_rings_p_up = channel->tx_count; > - priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP; > + priv->tx_ring_num = channel->tx_count * MLX4_EN_NUM_UP + > +
Re: linux-next: Tree for Jul 13 (net/core/devlink with Tracing)
On 07/12/16 23:47, Stephen Rothwell wrote: > Hi all, > > Changes since 20160712: > on x86_64: (full randconfig file is attached) CC net/core/devlink.o In file included from ../include/trace/define_trace.h:95:0, from ../include/trace/events/devlink.h:51, from ../net/core/devlink.c:30: ../include/trace/events/devlink.h: In function 'trace_event_get_offsets_devlink_hwmsg': ../include/trace/events/devlink.h:25:51: error: dereferencing pointer to incomplete type __string(owner_name, devlink->dev->driver->owner->name) ^ ../include/trace/trace_events.h:501:2: note: in definition of macro 'DECLARE_EVENT_CLASS' tstruct; \ ^ ../include/trace/trace_events.h:63:9: note: in expansion of macro 'PARAMS' PARAMS(tstruct), \ ^ ../include/trace/events/devlink.h:16:1: note: in expansion of macro 'TRACE_EVENT' TRACE_EVENT(devlink_hwmsg, ^ ../include/trace/events/devlink.h:22:2: note: in expansion of macro 'TP_STRUCT__entry' TP_STRUCT__entry( ^ ../include/trace/trace_events.h:466:29: note: in expansion of macro '__dynamic_array' #define __string(item, src) __dynamic_array(char, item, \ ^ ../include/trace/events/devlink.h:25:3: note: in expansion of macro '__string' __string(owner_name, devlink->dev->driver->owner->name) ^ ../include/trace/events/devlink.h:25:51: error: dereferencing pointer to incomplete type __string(owner_name, devlink->dev->driver->owner->name) ^ ../include/trace/trace_events.h:501:2: note: in definition of macro 'DECLARE_EVENT_CLASS' tstruct; \ ^ ../include/trace/trace_events.h:63:9: note: in expansion of macro 'PARAMS' PARAMS(tstruct), \ ^ ../include/trace/events/devlink.h:16:1: note: in expansion of macro 'TRACE_EVENT' TRACE_EVENT(devlink_hwmsg, ^ ../include/trace/events/devlink.h:22:2: note: in expansion of macro 'TP_STRUCT__entry' TP_STRUCT__entry( ^ ../include/trace/trace_events.h:466:29: note: in expansion of macro '__dynamic_array' #define __string(item, src) __dynamic_array(char, item, \ ^ ../include/trace/events/devlink.h:25:3: note: in expansion of macro 
'__string' __string(owner_name, devlink->dev->driver->owner->name) ^ In file included from ../include/trace/define_trace.h:95:0, from ../include/trace/events/devlink.h:51, from ../net/core/devlink.c:30: ../include/trace/events/devlink.h: In function 'trace_event_raw_event_devlink_hwmsg': ../include/trace/events/devlink.h:35:55: error: dereferencing pointer to incomplete type __assign_str(owner_name, devlink->dev->driver->owner->name); ^ ../include/trace/trace_events.h:686:4: note: in definition of macro 'DECLARE_EVENT_CLASS' { assign; } \ ^ ../include/trace/trace_events.h:64:9: note: in expansion of macro 'PARAMS' PARAMS(assign), \ ^ ../include/trace/events/devlink.h:16:1: note: in expansion of macro 'TRACE_EVENT' TRACE_EVENT(devlink_hwmsg, ^ ../include/trace/events/devlink.h:32:2: note: in expansion of macro 'TP_fast_assign' TP_fast_assign( ^ ../include/trace/events/devlink.h:35:3: note: in expansion of macro '__assign_str' __assign_str(owner_name, devlink->dev->driver->owner->name); ^ ../include/trace/events/devlink.h:35:55: error: dereferencing pointer to incomplete type __assign_str(owner_name, devlink->dev->driver->owner->name); ^ ../include/trace/trace_events.h:686:4: note: in definition of macro 'DECLARE_EVENT_CLASS' { assign; } \ ^ ../include/trace/trace_events.h:64:9: note: in expansion of macro 'PARAMS' PARAMS(assign), \ ^ ../include/trace/events/devlink.h:16:1: note: in expansion of macro 'TRACE_EVENT' TRACE_EVENT(devlink_hwmsg, ^ ../include/trace/events/devlink.h:32:2: note: in expansion of macro 'TP_fast_assign' TP_fast_assign( ^ ../include/trace/events/devlink.h:35:3: note: in expansion of macro '__assign_str' __assign_str(owner_name, devlink->dev->driver->owner->name); ^ In file included from ../include/trace/define_trace.h:96:0, from ../include/trace/events/devlink.h:51, from ../net/core/devlink.c:30: ../include/trace/events/devlink.h: In function 'perf_trace_devlink_hwmsg': ../include/trace/events/devlink.h:35:55: error: dereferencing pointer to 
incomplete type __assign_str(owner_name, devlink->dev->driver->owner->name); ^ ../include/trace/perf.h:65:4: note: in definition of macro 'DECLARE_EVENT_CLASS' { assign; } \ ^ ../include/trace/trace_events.h:64:9: note: in expansion of macro 'PARAMS' PARAMS(assign), \ ^ ../include/trace
Re: [PATCH v8 00/11] Add driver bpf hook for early packet drop and forwarding
On 12/07/2016 5:38 PM, Tariq Toukan wrote: Regression tests for mlx4_en are currently running, results will be ready by tomorrow morning. Functional regression results look fine. Regards, Tariq
Re: [PATCH v8 04/11] net/mlx4_en: add support for fast rx drop bpf program
On Wed, Jul 13, 2016 at 11:27:23AM +, David Laight wrote: > From: Brenden Blanco > > Sent: 12 July 2016 08:51 > > Add support for the BPF_PROG_TYPE_XDP hook in mlx4 driver. > > > > In tc/socket bpf programs, helpers linearize skb fragments as needed > > when the program touches the packet data. However, in the pursuit of > > speed, XDP programs will not be allowed to use these slower functions, > > especially if it involves allocating an skb. > > > > Therefore, disallow MTU settings that would produce a multi-fragment > > packet that XDP programs would fail to access. Future enhancements could > > be done to increase the allowable MTU. > > Maybe I'm misunderstanding what is going on here... > But what has the MTU to do with how skb are fragmented? This is mlx4 specific...depending on the MTU the driver will write data into 1536, 1536+4096, 1536+4096+4096, etc. fragments. > > If the skb come from a reasonably written USB ethernet interface they could > easily have arbitrary fragment boundaries (the frames get packed into USB > buffers). The XDP program is operating directly on the packet memory, before any skb has been allocated. The program also expects a contiguous memory region to inspect...it's too expensive to linearize the data like we do in the tc hook case, that's a feature that costs too much for this type of low level feature. Therefore, XDP can only be turned on in combination with a cooperative driver, that's the performance tradeoff we're imposing here. > > Outbound skb can also have fragments depending on how they are generated. Sure, but XDP won't run on those. This is an rx-only feature. > > David
Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records
Hi Peter, On 07/13/2016 03:42 PM, Peter Zijlstra wrote: Ok so the nonlinear thing was it doing _two_ copies, one the regular __output_copy() on raw->data and second the optional fragment thingy using __output_custom(). Would something like this work instead? It does the nonlinear thing and the custom copy function thing but allows more than 2 fragments and allows each fragment to have a custom copy. It doesn't look obviously more expensive; it has the one ->copy branch extra, but then it doesn't recompute the sizes. Yes, that would work as well on a quick glance with diff just a bit bigger, but more generic this way. Do you want me to adapt this into the first patch? One question below: diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1fe22032f228..83e2a83e8db3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -69,9 +69,18 @@ struct perf_callchain_entry_ctx { boolcontexts_maxed; }; +typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long len); + +struct perf_raw_frag { + struct perf_raw_frag*next; + perf_copy_f copy; + void*data; + u32 size; +} __packed; + struct perf_raw_record { + struct perf_raw_fragfrag; u32 size; - void*data; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index fe8d49a56322..f7ad7d65317d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5617,16 +5617,21 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), -sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + 
__output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, frag->size); + } + frag = frag->next; + } while (frag); We still need the zero padding here from above with the computed raw->size, right? } else { struct { u32 size; @@ -5751,14 +5756,22 @@ void perf_prepare_sample(struct perf_event_header *header, Thanks, Daniel
Re: [PATCH] net: ip_finish_output_gso: If skb_gso_network_seglen exceeds MTU, do segmentation even for non IPSKB_FORWARDED skbs
Hi Florian, Hannes, On Tue, 12 Jul 2016 08:56:56 +0300 Shmulik Ladkani wrote: > On Sat, 9 Jul 2016 15:22:30 +0200 Florian Westphal wrote: > > > > > > > What about setting IPCB FORWARD flag in iptunnel_xmit if > > > > skb->skb_iif != 0... instead? > > I've came up with a suggestion that does not abuse IPSKB_FORWARDED, > while properly addressing the use case (and similar ones), without > introducing the cost of entering 'skb_gso_validate_mtu' in the local > case. > > How about: > > @@ -220,12 +220,15 @@ static int ip_finish_output_gso(struct net *net, struct > sock *sk, > struct sk_buff *skb, unsigned int mtu) > { > netdev_features_t features; > + int local_trusted_gso; > struct sk_buff *segs; > int ret = 0; > > - /* common case: locally created skb or seglen is <= mtu */ > - if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || > - skb_gso_validate_mtu(skb, mtu)) > + local_trusted_gso = (IPCB(skb)->flags & IPSKB_FORWARDED) == 0 && > + !(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY); > + /* common case: locally created skb from a trusted gso source or > + * seglen is <= mtu */ > + if (local_trusted_gso || skb_gso_validate_mtu(skb, mtu)) > return ip_finish_output2(net, sk, skb); > > /* Slowpath - GSO segment length is exceeding the dst MTU. > > This well addresses the usecase where we have gso-skb arriving from an > untrusted source, thus its gso_size is out of our control (e.g. tun/tap, > macvtap, af_packet, xen-netfront...). > > Locally "gso trusted" skbs (the common case) will NOT suffer the > additional (possibly costly) call to 'skb_gso_validate_mtu'. > > Also, if IPSKB_FORWARDED is true, behavior stays exactly the same. Any comments regarding the latest suggestion above? I'd like to post it as v2 - if it is in the right direction. It handles the problem of gso_size values which are not in host's control, it addresses the usecase described, and has a benefit of not overloading IPSKB_FORWARDED with a new semantic that might be hard to maintain. 
PS: Also, if we'd like to pinpoint it even further, we can: local_trusted_gso = (IPCB(skb)->flags & IPSKB_FORWARDED) == 0 && (!sk || !(skb_shinfo(skb)->gso_type & SKB_GSO_DODGY)); Which ensures only the following conditions go to the expensive skb_gso_validate_mtu: 1. IPSKB_FORWARDED is on 2. IPSKB_FORWARDED is off, but sk exists and gso_size is untrusted. Meaning: we have a packet arriving from higher layers (sk is set) with a gso_size out of host's control. This fine-tuning leaves standard l2 bridging case (e.g 2x taps bridged) of a gso skb unaffected, as sk would be NULL. Many thanks, Shmulik
[PATCH 0/2] Code style fixes
From: Elad Kanfi Fix all checkpatch warnings and errors, and reuse code Elad Kanfi (2): net: nps_enet: fix coding style issues net: nps_enet: code reuse drivers/net/ethernet/ezchip/nps_enet.c | 27 ++- 1 files changed, 14 insertions(+), 13 deletions(-)
Re: [RFC PATCH v3] net: sched: convert qdisc linked list to hashtable
On Tue, 12 Jul 2016, Cong Wang wrote: > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h > > index f45929c..0b5c172e 100644 > > --- a/include/linux/netdevice.h > > +++ b/include/linux/netdevice.h > > @@ -52,6 +52,7 @@ > > #include > > #include > > #include > > +#include > > > > struct netpoll_info; > > struct device; > > @@ -1778,6 +1779,7 @@ struct net_device { > > unsigned intnum_tx_queues; > > unsigned intreal_num_tx_queues; > > struct Qdisc*qdisc; > > + DECLARE_HASHTABLE (qdisc_hash, 4); > > unsigned long tx_queue_len; > > spinlock_t tx_global_lock; > > int watchdog_timeo; > > Should it be surrounded by CONFIG_NET_SCHED? > To save several bytes for !CONFIG_NET_SCHED case. Makes sense. I'll wait a bit for more feedback (if there is any) before including this in potential v4. Thanks, -- Jiri Kosina SUSE Labs
Re: [PATCH v3] Marvell phy: add fiber status check and configuration for some phys
Hi Andrew, Le 13/07/2016 à 15:26, Andrew Lunn a écrit : * * Generic status code does not detect Fiber correctly! @@ -906,12 +1070,17 @@ static int marvell_read_status(struct phy_device *phydev) int lpa; int lpagb; int status = 0; + int page, fiber; - /* Update the link, but return if there + /* Detect and update the link, but return if there * was an error */ - err = genphy_update_link(phydev); - if (err) - return err; + page = phy_read(phydev, MII_MARVELL_PHY_PAGE); + if (page == MII_M_FIBER) + fiber = 1; + else + fiber = 0; >>> >>> This read is expensive, since the MDIO bus is slow. It would be better >>> just to pass fibre as a parameter. >> >> But this function is used for other Marvell's phy, without fiber link for >> example. >> And this function should has only the struct phy_device as parameter. >> >> I don't have idea to avoid that, without create a custom function for that >> which would be very similar to this function. >> Or used a phy_device field for that? I think it's awful idea... > > So i would have > > static int marvell_read_status_page(struct phy_device *phydev, int page) > {} > > basically doing what you have above, but without the read. > > static int marvell_read_status(struct phy_device *phydev) > { > if (phydev->supported & SUPPORTED_FIBRE) { > marvell_read_status_page(phydev, MII_M_FIBER); > if (phydev->link) > return; > > return marvell_read_status_page(phydev, MII_M_COPPER); > } Oh I see. Thank you! > >>> I think it would be better to look for SUPPORTED_FIBRE in >>> drv->features, rather than have two different functions. >>> >>> In fact, i would do that in general, rather than add your _fibre() >>> functions. >> >> So, you suggest to do that in genphy_* functions or create marvell_* >> functions with this condition? >> I'm agree with the second suggestion. > > The second. I'm working on this. It's done for _resume and _suspend. It will be done for _status. But, for aneg or ethtool concerned, I think adding these functions is better. 
+ +/* marvell_resume_fiber + * + * Some Marvell's phys have two modes: fiber and copper. + * Both need to be resumed + */ +static int marvell_resume_fiber(struct phy_device *phydev) +{ + int err; + + /* Resume the fiber mode first */ + err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_FIBER); + if (err < 0) + goto error; + + err = genphy_resume(phydev); + if (err < 0) + goto error; + + /* Then, the copper link */ + err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_COPPER); + if (err < 0) + goto error; + + return genphy_resume(phydev); >>> >>> Should it be resumed twice? Or just once at the end? Same question >>> for suspend. >> >> I don't understand your question. > > You call genphy_resume(phydev) twice. Once is sufficient. Yes, but it's normal because each interface could be suspended or resumed independently. genphy_* functions use BMCR register which are identical between fiber and copper link. But each link has its own register to change. Thank you. Regards. Charles-Antoine Couret
Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records
Ok so the nonlinear thing was it doing _two_ copies, one the regular __output_copy() on raw->data and second the optional fragment thingy using __output_custom(). Would something like this work instead? It does the nonlinear thing and the custom copy function thing but allows more than 2 fragments and allows each fragment to have a custom copy. It doesn't look obviously more expensive; it has the one ->copy branch extra, but then it doesn't recompute the sizes. --- diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1fe22032f228..83e2a83e8db3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -69,9 +69,18 @@ struct perf_callchain_entry_ctx { boolcontexts_maxed; }; +typedef unsigned long (*perf_copy_f)(void *dst, const void *src, unsigned long len); + +struct perf_raw_frag { + struct perf_raw_frag*next; + perf_copy_f copy; + void*data; + u32 size; +} __packed; + struct perf_raw_record { + struct perf_raw_fragfrag; u32 size; - void*data; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index fe8d49a56322..f7ad7d65317d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5617,16 +5617,21 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), -sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, frag->size); + } + frag = frag->next; + } while (frag); } else { struct { u32 size; @@ -5751,14 +5756,22 @@ void 
perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size = sizeof(u64); - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + if (raw) { + struct perf_raw_frag *frag = &raw->frag; - header->size += round_up(size, sizeof(u64)); + size = sizeof(u32); + do { + size += frag->size; + frag = frag->next; + } while (frag) + size = round_up(size, sizeof(u64)); + raw->size = size; + } + + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
Re: [PATCH v3] Marvell phy: add fiber status check and configuration for some phys
On Wed, Jul 13, 2016 at 11:14:21AM +0200, Charles-Antoine Couret wrote: Hi Charles-Antoine > >> +#define LPA_FIBER_1000HALF0x40 > >> +#define LPA_FIBER_1000FULL0x20 > >> + > >> +#define LPA_PAUSE_FIBER 0x180 > >> +#define LPA_PAUSE_ASYM_FIBER 0x100 > >> + > >> +#define ADVERTISE_FIBER_1000HALF 0x40 > >> +#define ADVERTISE_FIBER_1000FULL 0x20 > >> + > >> +#define ADVERTISE_PAUSE_FIBER 0x180 > >> +#define ADVERTISE_PAUSE_ASYM_FIBER0x100 > > > > Are these standardised anywhere? If they are following a standard, > > they should be put into include/uapi/linux/mii.h. > I don't find any standard about this, I think it should be Marvell specific. O.K. > >> +static inline u32 ethtool_adv_to_fiber_adv_t(u32 ethadv) > >> +{ > >> + u32 result = 0; > >> + > >> + if (ethadv & ADVERTISED_1000baseT_Half) > >> + result |= ADVERTISE_FIBER_1000HALF; > > > > Dumb question: Does 1000baseT_Half even make sense for fibre? Can you > > do half duplex? Would that not mean you have a single fibre, both > > ends are using the same laser frequency, and you are doing some form > > of CSMA/CD? > > It's strange, I agree, but the register about that exists in the datasheet > and the value is not fixed. > In practice, I don't have a component to test this case correctly. O.K, just implement it according to the data sheet. > >> * > >> * Generic status code does not detect Fiber correctly! > >> @@ -906,12 +1070,17 @@ static int marvell_read_status(struct phy_device > >> *phydev) > >>int lpa; > >>int lpagb; > >>int status = 0; > >> + int page, fiber; > >> > >> - /* Update the link, but return if there > >> + /* Detect and update the link, but return if there > >> * was an error */ > >> - err = genphy_update_link(phydev); > >> - if (err) > >> - return err; > >> + page = phy_read(phydev, MII_MARVELL_PHY_PAGE); > >> + if (page == MII_M_FIBER) > >> + fiber = 1; > >> + else > >> + fiber = 0; > > > > This read is expensive, since the MDIO bus is slow. 
It would be better > > just to pass fibre as a parameter. > > But this function is used for other Marvell's phy, without fiber link for > example. > And this function should have only the struct phy_device as parameter. > > I don't have an idea to avoid that, without creating a custom function for that > which would be very similar to this function. > Or use a phy_device field for that? I think it's an awful idea... So I would have static int marvell_read_status_page(struct phy_device *phydev, int page) {} basically doing what you have above, but without the read. static int marvell_read_status(struct phy_device *phydev) { if (phydev->supported & SUPPORTED_FIBRE) { marvell_read_status_page(phydev, MII_M_FIBER); if (phydev->link) return; return marvell_read_status_page(phydev, MII_M_COPPER); } > > I think it would be better to look for SUPPORTED_FIBRE in > > drv->features, rather than have two different functions. > > > > In fact, I would do that in general, rather than add your _fibre() > > functions. > > So, you suggest to do that in genphy_* functions or create marvell_* > functions with this condition? > I agree with the second suggestion. The second. > > >> + > >> +/* marvell_resume_fiber > >> + * > >> + * Some Marvell's phys have two modes: fiber and copper. > >> + * Both need to be resumed > >> + */ > >> +static int marvell_resume_fiber(struct phy_device *phydev) > >> +{ > >> + int err; > >> + > >> + /* Resume the fiber mode first */ > >> + err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_FIBER); > >> + if (err < 0) > >> + goto error; > >> + > >> + err = genphy_resume(phydev); > >> + if (err < 0) > >> + goto error; > >> + > >> + /* Then, the copper link */ > >> + err = phy_write(phydev, MII_MARVELL_PHY_PAGE, MII_M_COPPER); > >> + if (err < 0) > >> + goto error; > >> + > >> + return genphy_resume(phydev); > > > > Should it be resumed twice? Or just once at the end? Same question > > for suspend. > > I don't understand your question. 
You call genphy_resume(phydev) twice. Once is sufficient. Andrew
Re: [PATCH v2 6/6] dt-bindings: net: bgmac: add bindings documentation for bgmac
On Thu, Jul 07, 2016 at 07:08:58PM -0400, Jon Mason wrote: > Signed-off-by: Jon Mason > --- > .../devicetree/bindings/net/brcm,amac.txt | 24 > ++ > .../devicetree/bindings/net/brcm,bgmac-nsp.txt | 24 > ++ > 2 files changed, 48 insertions(+) > create mode 100644 Documentation/devicetree/bindings/net/brcm,amac.txt > create mode 100644 Documentation/devicetree/bindings/net/brcm,bgmac-nsp.txt Acked-by: Rob Herring
Re: [PATCH 1/4] mac80211: mesh: flush stations before beacons are stopped
On Wed, Jul 13, 2016 at 10:11:25AM +, Machani, Yaniv wrote: > > > Some drivers (e.g. wl18xx) expect that the last stage in the > > > de-initialization process will be stopping the beacons, similar to ap. > > > Update ieee80211_stop_mesh() flow accordingly. > > > > > How well have you tested that with other drivers? > > > > Sorry for the delayed response (I've been out) and thanks for your comments, > I have tested it with RT3572 as well, and didn't see any issue. > I'll update the comment to reflect that. I'll give this a test on ath10k and wcn36xx as they are the ones most likely to care about ordering. -- Bob Copeland %% http://bobcopeland.com/
Re: [patch v2] net/mlx5: missing error code in esw_create_offloads_fdb_table()
On 13/07/2016 16:04, Leon Romanovsky wrote: On Wed, Jul 13, 2016 at 02:48:44PM +0300, Dan Carpenter wrote: We accidentally return success when we had intended to return an error code. Fixes: 69697b6e2086 ('net/mlx5: E-Switch, Add support for the sriov offloads mode') Signed-off-by: Dan Carpenter --- v2: return -ENOTSUPP instead of -EINVAL I'm a little bit confused. Why did you prefer ENOTSUPP over EOPNOTSUPP? According to [1], it fits our case better - operation is valid and makes sense, but isn't supported. [1] https://lists.gnu.org/archive/html/bug-glibc/2002-08/msg00017.html Thanks.
Re: [PATCH v5 10/11] Documentation: dtb: xgene: Add MDIO node
On Thu, Jul 07, 2016 at 04:02:58PM -0700, Iyappan Subramanian wrote: > Signed-off-by: Iyappan Subramanian > Tested-by: Fushen Chen > Tested-by: Toan Le > Tested-by: Matthias Brugger > --- > .../devicetree/bindings/net/apm-xgene-mdio.txt | 37 > ++ > 1 file changed, 37 insertions(+) > create mode 100644 Documentation/devicetree/bindings/net/apm-xgene-mdio.txt > > diff --git a/Documentation/devicetree/bindings/net/apm-xgene-mdio.txt > b/Documentation/devicetree/bindings/net/apm-xgene-mdio.txt > new file mode 100644 > index 000..0247e70 > --- /dev/null > +++ b/Documentation/devicetree/bindings/net/apm-xgene-mdio.txt > @@ -0,0 +1,37 @@ > +APM X-Gene SoC MDIO node > + > +MDIO node is defined to describe on-chip MDIO controller. > + > +Required properties: > + - compatible: Must be "apm,xgene-mdio-rgmii" > + - #address-cells: Must be <1>. > + - #size-cells: Must be <0>. > + - reg: Address and length of the register set > + - clocks: Reference to the clock entry > + > +For the phys on the mdio bus, there must be a node with the following fields: > + - compatible: PHY identifier. Please refer ./phy.txt for the format. > + - reg: The ID number for the phy. > + > +Example: > + > + mdio: mdio@0x1702 { Drop the '0x' With that, Acked-by: Rob Herring > + compatible = "apm,xgene-mdio-rgmii"; > + #address-cells = <1>; > + #size-cells = <0>; > + reg = <0x0 0x1702 0x0 0xd100>; > + clocks = <&menetclk 0>; > + }; > + > + /* Board-specific peripheral configurations */ > + &mdio { > + menetphy: phy@3 { > + reg = <0x3>; > + }; > + sgenet0phy: phy@4 { > + reg = <0x4>; > + }; > + sgenet1phy: phy@5 { > + reg = <0x5>; > + }; > + }; > -- > 1.9.1 > > -- > To unsubscribe from this list: send the line "unsubscribe devicetree" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 1/3] perf, events: add non-linear data support for raw records
On 07/13/2016 02:10 PM, Peter Zijlstra wrote: On Wed, Jul 13, 2016 at 11:24:13AM +0200, Daniel Borkmann wrote: On 07/13/2016 09:52 AM, Peter Zijlstra wrote: On Wed, Jul 13, 2016 at 12:36:17AM +0200, Daniel Borkmann wrote: This patch adds support for non-linear data on raw records. It means that for such data, the newly introduced __output_custom() helper will be used instead of __output_copy(). __output_custom() will invoke whatever custom callback is passed in via struct perf_raw_record_frag to extract the data into the ring buffer slot. To keep changes in perf_prepare_sample() and in perf_output_sample() minimal, size/size_head split was added to perf_raw_record that call sites fill out, so that two extra tests in fast-path can be avoided. The few users of raw records are adapted to initialize their size_head and frag data; no change in behavior for them. Later patch will extend BPF side with a first user and callback for this facility, future users could be things like XDP BPF programs (that work on different context though and would thus have a different callback), etc. Why? What problem are we solving? I've tried to summarize it in patch 3/3, Which is pretty useless if you're staring at this patch. This currently has 3 issues we'd like to resolve: i) We need two copies instead of just a single one for the skb data. The data can be non-linear, see also skb_copy_bits() as an example for walking/extracting it, I'm not familiar enough with the network gunk to be able to read that. But upto skb_walk_frags() it looks entirely linear to me. 
Hm, fair enough, there are three parts, skb can have a linear part which is taken via skb->data, either in its entirety or there can be a non-linear part appended to that which can consist of pages that are in shared info section (skb_shinfo(skb) -> frags[], nr_frags members), that will be linearized, and in addition to that, appended after the frags[] data there can be further skbs to the 'root' skb that contain fragmented data, which is all what skb_copy_bits() copies linearized into 'to' buffer. So depending on the origin of the skb, its structure can be quite different and skb_copy_bits() covers all the cases generically. Maybe [1] summarizes it better if you want to familiarize yourself with how skbs work, although some parts are not up to date anymore. [1] http://vger.kernel.org/~davem/skb_data.html ii) for static verification reasons, the bpf_skb_load_bytes() helper needs to see a constant size on the passed buffer to make sure BPF verifier can do its sanity checks on it during verification time, so just passing in skb->len (or any other non-constant value) wouldn't work, but changing bpf_skb_load_bytes() is also not the real solution since we still have two copies we'd like to avoid as well, and iii) bpf_skb_load_bytes() is just for rather smaller buffers (e.g. headers) since they need to sit on the limited eBPF stack anyway. The set would improve the BPF helper to address all 3 at once. Humm, maybe. Lemme go try and reverse engineer that patch, because I'm not at all sure wth it's supposed to do, nor am I entirely sure this clarified things :/
[PATCH -next] stmmac: dwmac-socfpga: remove redundant dev_err call in socfpga_dwmac_parse_data()
From: Wei Yongjun There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. Signed-off-by: Wei Yongjun --- drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c | 17 +++-- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c index 3bc1fa2..edd20c3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c @@ -165,12 +165,8 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac *dwmac, struct device * dwmac->splitter_base = devm_ioremap_resource(dev, &res_splitter); - if (IS_ERR(dwmac->splitter_base)) { - dev_err(dev, - "%s: ERROR: failed mapping emac splitter\n", - __func__); + if (IS_ERR(dwmac->splitter_base)) return PTR_ERR(dwmac->splitter_base); - } } index = of_property_match_string(np_sgmii_adapter, "reg-names", @@ -188,11 +184,8 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac *dwmac, struct device * dwmac->pcs.sgmii_adapter_base = devm_ioremap_resource(dev, &res_sgmii_adapter); - if (IS_ERR(dwmac->pcs.sgmii_adapter_base)) { - dev_err(dev, "%s: failed to mapping adapter\n", - __func__); + if (IS_ERR(dwmac->pcs.sgmii_adapter_base)) return PTR_ERR(dwmac->pcs.sgmii_adapter_base); - } } index = of_property_match_string(np_sgmii_adapter, "reg-names", @@ -210,12 +203,8 @@ static int socfpga_dwmac_parse_data(struct socfpga_dwmac *dwmac, struct device * dwmac->pcs.tse_pcs_base = devm_ioremap_resource(dev, &res_tse_pcs); - if (IS_ERR(dwmac->pcs.tse_pcs_base)) { - dev_err(dev, - "%s: ERROR: failed mapping tse control port\n", - __func__); + if (IS_ERR(dwmac->pcs.tse_pcs_base)) return PTR_ERR(dwmac->pcs.tse_pcs_base); - } } } dwmac->reg_offset = reg_offset;
[PATCH -next] net: ethernet: bgmac: Remove redundant dev_err call in bgmac_probe()
From: Wei Yongjun There is a error message within devm_ioremap_resource already, so remove the dev_err call to avoid redundant error message. Signed-off-by: Wei Yongjun --- drivers/net/ethernet/broadcom/bgmac-platform.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bgmac-platform.c b/drivers/net/ethernet/broadcom/bgmac-platform.c index 1a2d841..be52f27 100644 --- a/drivers/net/ethernet/broadcom/bgmac-platform.c +++ b/drivers/net/ethernet/broadcom/bgmac-platform.c @@ -129,10 +129,8 @@ static int bgmac_probe(struct platform_device *pdev) } bgmac->plat.base = devm_ioremap_resource(&pdev->dev, regs); - if (IS_ERR(bgmac->plat.base)) { - dev_err(&pdev->dev, "Unable to map base resource\n"); + if (IS_ERR(bgmac->plat.base)) return PTR_ERR(bgmac->plat.base); - } regs = platform_get_resource_byname(pdev, IORESOURCE_MEM, "idm_base"); if (!regs) { @@ -141,10 +139,8 @@ static int bgmac_probe(struct platform_device *pdev) } bgmac->plat.idm_base = devm_ioremap_resource(&pdev->dev, regs); - if (IS_ERR(bgmac->plat.idm_base)) { - dev_err(&pdev->dev, "Unable to map idm resource\n"); + if (IS_ERR(bgmac->plat.idm_base)) return PTR_ERR(bgmac->plat.idm_base); - } bgmac->read = platform_bgmac_read; bgmac->write = platform_bgmac_write;