On Wed, 30 Nov 2016 11:30:00 -0800
Eric Dumazet <eric.duma...@gmail.com> wrote:

> On Wed, 2016-11-30 at 20:17 +0100, Jesper Dangaard Brouer wrote:
> 
> > Don't take it as critique, Eric.  I was hoping your patch would have
> > solved this issue of being sensitive to TX completion adjustments.  You
> > usually have good solutions for difficult issues. I basically rejected
> > Achiad's approach/patch because it was too sensitive to these kinds of
> > adjustments.  
> 
> Well, this patch can hurt latencies, because a doorbell can be delayed,
> and softirqs can be delayed by many hundreds of usec in some cases.
> 
> I would not enable this behavior by default.

What about another scheme, where dev_hard_start_xmit() can return an
indication that the driver chose not to flush (based on TX queue depth),
thereby requesting that the stack call flush at a later point?

Would that introduce fewer latency issues?


Patch muckup (not even compile tested):

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ffcd874cc20..d7d15e4e6766 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -109,6 +109,7 @@ enum netdev_tx {
        __NETDEV_TX_MIN  = INT_MIN,     /* make sure enum is signed */
        NETDEV_TX_OK     = 0x00,        /* driver took care of packet */
        NETDEV_TX_BUSY   = 0x10,        /* driver tx path was busy*/
+       NETDEV_TX_FLUSHME = 0x04,      /* driver requests doorbell/flush later */
 };
 typedef enum netdev_tx netdev_tx_t;
 
@@ -536,6 +537,8 @@ enum netdev_queue_state_t {
        __QUEUE_STATE_DRV_XOFF,
        __QUEUE_STATE_STACK_XOFF,
        __QUEUE_STATE_FROZEN,
+       // __QUEUE_STATE_NEED_FLUSH
+       // or is it better to store this in txq state?
 };
 
 #define QUEUE_STATE_DRV_XOFF   (1 << __QUEUE_STATE_DRV_XOFF)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index e6aa0a249672..7480e44c5a50 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -75,6 +75,7 @@ struct Qdisc {
        void                    *u32_node;
 
        struct netdev_queue     *dev_queue;
+       struct netdev_queue     *flush_dev_queue; // store txq to flush here?
 
        struct gnet_stats_rate_est64    rate_est;
        struct gnet_stats_basic_cpu __percpu *cpu_bstats;
@@ -98,6 +99,20 @@ struct Qdisc {
        spinlock_t              busylock ____cacheline_aligned_in_smp;
 };
 
+static inline void qdisc_request_txq_flush(struct Qdisc *qdisc,
+                                          struct netdev_queue *txq)
+{
+       if (qdisc->flush_dev_queue) {
+               if (likely(qdisc->flush_dev_queue == txq))
+                       return;
+               /* Flush previously stored txq before reassigning it */
+               dev_flush_xmit(qdisc_dev(qdisc), qdisc->flush_dev_queue);
+       }
+       qdisc->flush_dev_queue = txq;
+}
+
 static inline bool qdisc_is_running(const struct Qdisc *qdisc)
 {
        return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
@@ -117,6 +132,19 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
 
 static inline void qdisc_run_end(struct Qdisc *qdisc)
 {
+       /* flush device txq here, if needed */
+       if (qdisc->flush_dev_queue) {
+               struct netdev_queue *txq = qdisc->flush_dev_queue;
+               struct net_device *dev = qdisc_dev(qdisc);
+
+               qdisc->flush_dev_queue = NULL;
+               dev_flush_xmit(dev, txq);
+               /*
+                * DISCUSS: is it too soon to flush here? What about
+                * rescheduling a NAPI poll cycle for this device
+                * before calling flush?
+                */
+       }
        write_seqcount_end(&qdisc->running);
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 048b46b7c92a..70339c267f33 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2880,6 +2880,15 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_skb_features);
 
+static void dev_flush_xmit(struct net_device *dev,
+                          struct netdev_queue *txq)
+{
+       const struct net_device_ops *ops = dev->netdev_ops;
+
+       /* Assumes a new ndo_flush_xmit op added to net_device_ops,
+        * plus a declaration visible to the callers in sch_generic.h.
+        */
+       ops->ndo_flush_xmit(dev, txq);
+       // Oh oh, do we need to take HARD_TX_LOCK ??
+}
+
 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
                    struct netdev_queue *txq, bool more)
 {
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 6cfb6e9038c2..55c01b6f6311 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -190,6 +190,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
 
        if (dev_xmit_complete(ret)) {
                /* Driver sent out skb successfully or skb was consumed */
+               if (ret == NETDEV_TX_FLUSHME) {
+                       /* Driver chose to skip the TX-doorbell MMIO write.
+                        * This makes taking the qdisc root_lock less expensive.
+                        */
+                       qdisc_request_txq_flush(q, txq);
+                       // Flush happens later in qdisc_run_end()
+               }
                ret = qdisc_qlen(q);
        } else {
                /* Driver returned NETDEV_TX_BUSY - requeue skb */
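
To make the driver side concrete (it is not part of the patch above), here
is a rough sketch of how a driver could use this.  All mydrv_* names, the
ring fields, and MYDRV_DEFER_THRESHOLD are hypothetical placeholders, and
ndo_flush_xmit is the new op this patch would have to add to
net_device_ops, not existing kernel API:

#include <linux/netdevice.h>

/* Hypothetical driver internals, declared only for illustration */
struct mydrv_ring {
	unsigned int inflight;	/* descs the NIC knows about, not completed */
};
static struct mydrv_ring *mydrv_skb_ring(struct net_device *dev,
					 struct sk_buff *skb);
static struct mydrv_ring *mydrv_txq_ring(struct net_device *dev,
					 struct netdev_queue *txq);
static void mydrv_post_descriptor(struct mydrv_ring *ring,
				  struct sk_buff *skb);
static void mydrv_ring_doorbell(struct mydrv_ring *ring);
#define MYDRV_DEFER_THRESHOLD	8

static netdev_tx_t mydrv_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct mydrv_ring *ring = mydrv_skb_ring(dev, skb);

	mydrv_post_descriptor(ring, skb);	/* post desc, no doorbell yet */

	/* If the NIC already has enough announced work to stay busy,
	 * defer the doorbell MMIO write and ask the stack to flush later.
	 */
	if (ring->inflight >= MYDRV_DEFER_THRESHOLD)
		return NETDEV_TX_FLUSHME;

	mydrv_ring_doorbell(ring);		/* shallow queue: flush now */
	return NETDEV_TX_OK;
}

static void mydrv_flush_xmit(struct net_device *dev, struct netdev_queue *txq)
{
	struct mydrv_ring *ring = mydrv_txq_ring(dev, txq);

	mydrv_ring_doorbell(ring);		/* the deferred doorbell write */
}

static const struct net_device_ops mydrv_netdev_ops = {
	.ndo_start_xmit	= mydrv_xmit,
	.ndo_flush_xmit	= mydrv_flush_xmit,	/* new op from this patch */
};

The point being that NETDEV_TX_FLUSHME is only safe for a driver to return
when the stack guarantees a later dev_flush_xmit() call, which
qdisc_run_end() above is meant to provide; otherwise a posted descriptor
could sit unannounced in the ring.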


-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer
