On 11/18/22 11:40, Dheeraj Kumar via dev wrote:
> Problem Statement:
> Before OVS 2.12 the OVS-DPDK datapath transmitted processed rx packet batches
> directly to the wanted tx queues. In OVS 2.12 each PMD stores the processed
> packets in an intermediate buffer per output port and flushes these output
> buffers in a separate step. This buffering was introduced to allow better
> batching of packets for transmit.
> 
> The current implementation of the function that flushes the output buffers
> performs a full scan over all output ports, even if only a single packet
> was buffered for a single output port. In systems with hundreds of ports
> this can take a long time and degrade OVS-DPDK performance significantly.
> 
> Solution:
> Maintain a list of output ports with buffered packets for each PMD thread and
> only iterate over that list when flushing output buffers.
> 
> Signed-off-by: Dheeraj Kumar <dheera...@acldigital.com>

Hi, Dheeraj.  Thanks for the follow-up and sorry for the delay.

Please, add the version number to the subject prefix while sending
new versions of patches, e.g. '[PATCH v2]'.

I didn't test it, but the patch looks good to me in general;
see some minor comments inline.

Best regards, Ilya Maximets.

> ---
>  lib/dpif-netdev-private-thread.h |  7 ++++---
>  lib/dpif-netdev.c                | 23 +++++++++++------------
>  2 files changed, 15 insertions(+), 15 deletions(-)
> 
> diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
> index 4472b199d..2775e1a2b 100644
> --- a/lib/dpif-netdev-private-thread.h
> +++ b/lib/dpif-netdev-private-thread.h
> @@ -185,9 +185,6 @@ struct dp_netdev_pmd_thread {
>       * than 'cmap_count(dp->poll_threads)'. */
>      uint32_t static_tx_qid;
>  
> -    /* Number of filled output batches. */
> -    int n_output_batches;
> -
>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>      /* List of rx queues to poll. */
>      struct hmap poll_list OVS_GUARDED;
> @@ -213,6 +210,10 @@ struct dp_netdev_pmd_thread {
>      struct hmap tnl_port_cache;
>      struct hmap send_port_cache;
>  
> +    /* Keep track of the ports with buffered output packets in
> +     * send_port_cache. */
> +    struct ovs_list pending_tx_ports;
> +
>      /* Keep track of detailed PMD performance statistics. */
>      struct pmd_perf_stats perf_stats;
>  
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index a45b46014..af0e0cc5d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -500,6 +500,7 @@ struct tx_port {
>      int qid;
>      long long last_used;
>      struct hmap_node node;
> +    struct ovs_list pending_tx;     /* Only used in send_port_cache. */

The comment seems incorrect.  It should be 'pending_tx_ports'
instead of 'send_port_cache'.  It's also better to specify
the structure that will reference this node and add a 'node'
to the name to highlight that it's not a standalone list.  E.g.:

    struct ovs_list pending_tx_node;  /* In dp_netdev_pmd_thread's
                                       * 'pending_tx_ports'. */

>      long long flush_time;
>      struct dp_packet_batch output_pkts;
>      struct dp_packet_batch *txq_pkts; /* Only for hash mode. */
> @@ -5241,8 +5242,9 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
>      atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
>      p->flush_time = pmd->ctx.now + tx_flush_interval;
>  
> -    ovs_assert(pmd->n_output_batches > 0);
> -    pmd->n_output_batches--;
> +    /* Remove send port from pending port list */
> +    ovs_assert(!ovs_list_is_empty(&p->pending_tx));
> +    ovs_list_remove(&p->pending_tx);

We should re-init the node here.
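For example, something along these lines (assuming the member gets
renamed to 'pending_tx_node' as suggested above):

    /* Remove the port from the pending list and re-initialize the node,
     * so that an empty node can later be used as an indicator that the
     * port is not queued for a flush. */
    ovs_list_remove(&p->pending_tx_node);
    ovs_list_init(&p->pending_tx_node);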

>  
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
> @@ -5264,16 +5266,11 @@ static int
>  dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
>                                     bool force)
>  {
> -    struct tx_port *p;
> +    struct tx_port *p, *next;

SAFE iterators don't need a 'next' pointer these days; you should
be able to just drop it.
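E.g., roughly, keeping the member name from this version of the patch:

    struct tx_port *p;

    LIST_FOR_EACH_SAFE (p, pending_tx, &pmd->pending_tx_ports) {
        if (force || pmd->ctx.now >= p->flush_time) {
            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
        }
    }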

>      int output_cnt = 0;
>  
> -    if (!pmd->n_output_batches) {
> -        return 0;
> -    }
> -
> -    HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
> -        if (!dp_packet_batch_is_empty(&p->output_pkts)
> -            && (force || pmd->ctx.now >= p->flush_time)) {
> +    LIST_FOR_EACH_SAFE (p, next, pending_tx, &pmd->pending_tx_ports) {
> +        if (force || pmd->ctx.now >= p->flush_time) {
>              output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
>          }
>      }
> @@ -6803,6 +6800,7 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>                                            n_txq * sizeof *tx_port->txq_pkts);
>                  tx_port_cached->txq_pkts = txq_pkts_cached;
>              }
> +            ovs_list_init(&tx_port_cached->pending_tx);
>              hmap_insert(&pmd->send_port_cache, &tx_port_cached->node,
>                          hash_port_no(tx_port_cached->port->port_no));
>          }
> @@ -7447,7 +7445,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      pmd->core_id = core_id;
>      pmd->numa_id = numa_id;
>      pmd->need_reload = false;
> -    pmd->n_output_batches = 0;
>  
>      ovs_refcount_init(&pmd->ref_cnt);
>      atomic_init(&pmd->exit, false);
> @@ -7474,6 +7471,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
>      hmap_init(&pmd->tnl_port_cache);
>      hmap_init(&pmd->send_port_cache);
>      cmap_init(&pmd->tx_bonds);
> +    ovs_list_init(&pmd->pending_tx_ports);
>  
>      /* Initialize DPIF function pointer to the default configured version. */
>      atomic_init(&pmd->netdev_input_func, dp_netdev_impl_get_default());
> @@ -7498,6 +7496,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
>      struct dpcls *cls;
>  
>      dp_netdev_pmd_flow_flush(pmd);
> +    ovs_list_poison(&pmd->pending_tx_ports);

Not sure if it makes sense to poison the list here.

>      hmap_destroy(&pmd->send_port_cache);
>      hmap_destroy(&pmd->tnl_port_cache);
>      hmap_destroy(&pmd->tx_ports);
> @@ -8714,7 +8713,7 @@ dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
>          dp_netdev_pmd_flush_output_on_port(pmd, p);
>      }
>      if (dp_packet_batch_is_empty(&p->output_pkts)) {

We didn't have a direct way to check this before, so we used the
number of buffered packets as an indicator that this port was
already counted.  Now we can, and probably should, just check
whether the list node is empty.  This will work if we re-initialize
the node after the flush.
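E.g., something like this (again assuming the node is re-initialized
after the flush and renamed to 'pending_tx_node'):

    if (ovs_list_is_empty(&p->pending_tx_node)) {
        ovs_list_push_front(&pmd->pending_tx_ports, &p->pending_tx_node);
    }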

> -        pmd->n_output_batches++;
> +        ovs_list_push_front(&pmd->pending_tx_ports, &p->pending_tx);
>      }
>  
>      struct dp_packet *packet;
