Acked-by: Billy O'Mahony <[email protected]>
> -----Original Message-----
> From: [email protected] [mailto:ovs-dev-
> [email protected]] On Behalf Of Jan Scheurich
> Sent: Friday, January 12, 2018 12:39 AM
> To: [email protected]
> Cc: [email protected]
> Subject: [ovs-dev] [PATCH v9 2/2] dpif-netdev: Refactor cycle counting
>
> Simplify the historically grown TSC cycle counting in PMD threads.
> Cycles are currently counted for the following purposes:
>
> 1. Measure PMD ustilization
>
> PMD utilization is defined as ratio of cycles spent in busy iterations (at
> least one
> packet received or sent) over the total number of cycles.
>
> This is already done in pmd_perf_start_iteration() and
> pmd_perf_end_iteration() based on a TSC timestamp saved in current iteration
> at start_iteration() and the actual TSC at end_iteration().
> No dependency on intermediate cycle accounting.
>
> 2. Measure the processing load per RX queue
>
> This comprises cycles spend on polling and processing packets received from
> the
> rx queue and the cycles spent on delayed sending of these packets to tx queues
> (with time-based batching).
>
> The previous scheme using cycles_count_start(), cycles_count_intermediate()
> and cycles-count_end() originally introduced to simplify cycle counting and
> saving calls to rte_get_tsc_cycles() was rather obscuring things.
>
> Replace by a nestable cycle_timer with with start and stop functions to
> embrace
> a code segment to be timed. The timed code may contain arbitrary nested
> cycle_timers. The duration of nested timers is excluded from the outer timer.
>
> The caller must ensure that each call to cycle_timer_start() is followed by a
> call
> to cycle_timer_end(). Failure to do so will lead to assertion failure or a
> memory
> leak.
>
> The new cycle_timer is used to measure the processing cycles per rx queue.
> This is not yet strictly necessary but will be made use of in a subsequent
> commit.
>
> All cycle count functions and data are relocated to module dpif-netdev-perf.
>
> Signed-off-by: Jan Scheurich <[email protected]>
> ---
> lib/dpif-netdev-perf.h | 110 ++++++++++++++++++++++++++++++++++++++++---
> -
> lib/dpif-netdev.c | 122
> ++++++++++++++-----------------------------------
> 2 files changed, 135 insertions(+), 97 deletions(-)
>
> diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h index
> 53d60d3..5993c25 100644
> --- a/lib/dpif-netdev-perf.h
> +++ b/lib/dpif-netdev-perf.h
> @@ -23,6 +23,11 @@
> #include <string.h>
> #include <math.h>
>
> +#ifdef DPDK_NETDEV
> +#include <rte_config.h>
> +#include <rte_cycles.h>
> +#endif
> +
> #include "openvswitch/vlog.h"
> #include "ovs-atomic.h"
> #include "timeval.h"
> @@ -59,10 +64,6 @@ enum pmd_stat_type {
> * recirculation. */
> PMD_STAT_SENT_PKTS, /* Packets that have been sent. */
> PMD_STAT_SENT_BATCHES, /* Number of batches sent. */
> - PMD_CYCLES_POLL_IDLE, /* Cycles spent unsuccessful polling. */
> - PMD_CYCLES_POLL_BUSY, /* Cycles spent successfully polling and
> - * processing polled packets. */
> - PMD_CYCLES_OVERHEAD, /* Cycles spent for other tasks. */
> PMD_CYCLES_ITER_IDLE, /* Cycles spent in idle iterations. */
> PMD_CYCLES_ITER_BUSY, /* Cycles spent in busy iterations. */
> PMD_N_STATS
> @@ -85,11 +86,95 @@ struct pmd_counters {
>
> struct pmd_perf_stats {
> /* Start of the current PMD iteration in TSC cycles.*/
> + uint64_t start_it_tsc;
> + /* Latest TSC time stamp taken in PMD. */
> uint64_t last_tsc;
> + /* If non-NULL, outermost cycle timer currently running in PMD. */
> + struct cycle_timer *cur_timer;
> /* Set of PMD counters with their zero offsets. */
> struct pmd_counters counters;
> };
>
> +/* Support for accurate timing of PMD execution on TSC clock cycle level.
> + * These functions are intended to be invoked in the context of pmd
> +threads. */
> +
> +/* Read the TSC cycle register and cache it. Any function not requiring
> +clock
> + * cycle accuracy should read the cached value using
> +cycles_counter_get() to
> + * avoid the overhead of reading the TSC register. */
> +
> +static inline uint64_t
> +cycles_counter_update(struct pmd_perf_stats *s) { #ifdef DPDK_NETDEV
> + return s->last_tsc = rte_get_tsc_cycles(); #else
> + return s->last_tsc = 0;
> +#endif
> +}
> +
> +static inline uint64_t
> +cycles_counter_get(struct pmd_perf_stats *s) {
> + return s->last_tsc;
> +}
> +
> +/* A nestable timer for measuring execution time in TSC cycles.
> + *
> + * Usage:
> + * struct cycle_timer timer;
> + *
> + * cycle_timer_start(pmd, &timer);
> + * <Timed execution>
> + * uint64_t cycles = cycle_timer_stop(pmd, &timer);
> + *
> + * The caller must guarantee that a call to cycle_timer_start() is
> +always
> + * paired with a call to cycle_stimer_stop().
> + *
> + * Is is possible to have nested cycles timers within the timed code.
> +The
> + * execution time measured by the nested timers is excluded from the
> +time
> + * measured by the embracing timer.
> + */
> +
> +struct cycle_timer {
> + uint64_t start;
> + uint64_t suspended;
> + struct cycle_timer *interrupted;
> +};
> +
> +static inline void
> +cycle_timer_start(struct pmd_perf_stats *s,
> + struct cycle_timer *timer) {
> + struct cycle_timer *cur_timer = s->cur_timer;
> + uint64_t now = cycles_counter_update(s);
> +
> + if (cur_timer) {
> + cur_timer->suspended = now;
> + }
> + timer->interrupted = cur_timer;
> + timer->start = now;
> + timer->suspended = 0;
> + s->cur_timer = timer;
> +}
> +
> +static inline uint64_t
> +cycle_timer_stop(struct pmd_perf_stats *s,
> + struct cycle_timer *timer) {
> + /* Assert that this is the current cycle timer. */
> + ovs_assert(s->cur_timer == timer);
> + uint64_t now = cycles_counter_update(s);
> + struct cycle_timer *intr_timer = timer->interrupted;
> +
> + if (intr_timer) {
> + /* Adjust the start offset by the suspended cycles. */
> + intr_timer->start += now - intr_timer->suspended;
> + }
> + /* Restore suspended timer, if any. */
> + s->cur_timer = intr_timer;
> + return now - timer->start;
> +}
> +
> void pmd_perf_stats_init(struct pmd_perf_stats *s); void
> pmd_perf_stats_clear(struct pmd_perf_stats *s); void
> pmd_perf_read_counters(struct pmd_perf_stats *s, @@ -115,16 +200,23 @@
> pmd_perf_update_counter(struct pmd_perf_stats *s, }
>
> static inline void
> -pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc)
> +pmd_perf_start_iteration(struct pmd_perf_stats *s)
> {
> - s->last_tsc = now_tsc;
> + if (OVS_LIKELY(s->last_tsc)) {
> + /* We assume here that last_tsc was updated immediately prior at
> + * the end of the previous iteration, or just before the first
> + * iteration. */
> + s->start_it_tsc = s->last_tsc;
> + } else {
> + /* In case last_tsc has never been set before. */
> + s->start_it_tsc = cycles_counter_update(s);
> + }
> }
>
> static inline void
> -pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc,
> - int rx_packets)
> +pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets)
> {
> - uint64_t cycles = now_tsc - s->last_tsc;
> + uint64_t cycles = cycles_counter_update(s) - s->start_it_tsc;
>
> if (rx_packets > 0) {
> pmd_perf_update_counter(s, PMD_CYCLES_ITER_BUSY, cycles); diff --git
> a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 82d29bb..e371d11 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -32,10 +32,6 @@
> #include <sys/stat.h>
> #include <unistd.h>
>
> -#ifdef DPDK_NETDEV
> -#include <rte_cycles.h>
> -#endif
> -
> #include "bitmap.h"
> #include "cmap.h"
> #include "conntrack.h"
> @@ -509,8 +505,6 @@ struct tx_port {
> struct dp_netdev_pmd_thread_ctx {
> /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
> long long now;
> - /* Used to count cycles. See 'cycles_count_end()' */
> - unsigned long long last_cycles;
> };
>
> /* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate @@
> -3111,64 +3105,20 @@ dp_netdev_actions_free(struct dp_netdev_actions
> *actions)
> free(actions);
> }
>
> -static inline unsigned long long
> -cycles_counter(void)
> -{
> -#ifdef DPDK_NETDEV
> - return rte_get_tsc_cycles();
> -#else
> - return 0;
> -#endif
> -}
> -
> -/* Fake mutex to make sure that the calls to cycles_count_* are balanced */ -
> extern struct ovs_mutex cycles_counter_fake_mutex;
> -
> -/* Start counting cycles. Must be followed by 'cycles_count_end()' */
> -static
> inline void -cycles_count_start(struct dp_netdev_pmd_thread *pmd)
> - OVS_ACQUIRES(&cycles_counter_fake_mutex)
> - OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> - pmd->ctx.last_cycles = cycles_counter();
> -}
> -
> -/* Stop counting cycles and add them to the counter 'type' */ -static inline
> void
> -cycles_count_end(struct dp_netdev_pmd_thread *pmd,
> - enum pmd_stat_type type)
> - OVS_RELEASES(&cycles_counter_fake_mutex)
> - OVS_NO_THREAD_SAFETY_ANALYSIS
> -{
> - unsigned long long interval = cycles_counter() - pmd->ctx.last_cycles;
> -
> - pmd_perf_update_counter(&pmd->perf_stats, type, interval);
> -}
> -
> -/* Calculate the intermediate cycle result and add to the counter 'type' */ -
> static inline void -cycles_count_intermediate(struct dp_netdev_pmd_thread
> *pmd,
> - struct dp_netdev_rxq *rxq,
> - enum pmd_stat_type type)
> - OVS_NO_THREAD_SAFETY_ANALYSIS
> +static void
> +dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
> + enum rxq_cycles_counter_type type,
> + unsigned long long cycles)
> {
> - unsigned long long new_cycles = cycles_counter();
> - unsigned long long interval = new_cycles - pmd->ctx.last_cycles;
> - pmd->ctx.last_cycles = new_cycles;
> -
> - pmd_perf_update_counter(&pmd->perf_stats, type, interval);
> - if (rxq && (type == PMD_CYCLES_POLL_BUSY)) {
> - /* Add to the amount of current processing cycles. */
> - non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR],
> interval);
> - }
> + atomic_store_relaxed(&rx->cycles[type], cycles);
> }
>
> static void
> -dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
> +dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
> enum rxq_cycles_counter_type type,
> unsigned long long cycles) {
> - atomic_store_relaxed(&rx->cycles[type], cycles);
> + non_atomic_ullong_add(&rx->cycles[type], cycles);
> }
>
> static uint64_t
> @@ -3234,27 +3184,40 @@ dp_netdev_pmd_flush_output_packets(struct
> dp_netdev_pmd_thread *pmd)
>
> static int
> dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> - struct netdev_rxq *rx,
> + struct dp_netdev_rxq *rxq,
> odp_port_t port_no) {
> struct dp_packet_batch batch;
> + struct cycle_timer timer;
> int error;
> int batch_cnt = 0;
>
> + /* Measure duration for polling and processing rx burst. */
> + cycle_timer_start(&pmd->perf_stats, &timer);
> dp_packet_batch_init(&batch);
> - error = netdev_rxq_recv(rx, &batch);
> + error = netdev_rxq_recv(rxq->rx, &batch);
> if (!error) {
> + /* At least one packet received. */
> *recirc_depth_get() = 0;
> pmd_thread_ctx_time_update(pmd);
>
> batch_cnt = batch.count;
> dp_netdev_input(pmd, &batch, port_no);
> dp_netdev_pmd_flush_output_packets(pmd);
> - } else if (error != EAGAIN && error != EOPNOTSUPP) {
> - static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>
> - VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
> - netdev_rxq_get_name(rx), ovs_strerror(error));
> + /* Assign processing cycles to rx queue. */
> + uint64_t cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> + dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
> +
> + } else {
> + /* Discard cycles. */
> + cycle_timer_stop(&pmd->perf_stats, &timer);
> + if (error != EAGAIN && error != EOPNOTSUPP) {
> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1,
> + 5);
> +
> + VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
> + netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
> + }
> }
>
> return batch_cnt;
> @@ -3880,30 +3843,22 @@ dpif_netdev_run(struct dpif *dpif)
> struct dp_netdev *dp = get_dp_netdev(dpif);
> struct dp_netdev_pmd_thread *non_pmd;
> uint64_t new_tnl_seq;
> - int process_packets = 0;
>
> ovs_mutex_lock(&dp->port_mutex);
> non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
> if (non_pmd) {
> ovs_mutex_lock(&dp->non_pmd_mutex);
> - cycles_count_start(non_pmd);
> HMAP_FOR_EACH (port, node, &dp->ports) {
> if (!netdev_is_pmd(port->netdev)) {
> int i;
>
> for (i = 0; i < port->n_rxq; i++) {
> - process_packets =
> - dp_netdev_process_rxq_port(non_pmd,
> - port->rxqs[i].rx,
> - port->port_no);
> - cycles_count_intermediate(non_pmd, NULL,
> - process_packets
> - ? PMD_CYCLES_POLL_BUSY
> - : PMD_CYCLES_POLL_IDLE);
> + dp_netdev_process_rxq_port(non_pmd,
> + &port->rxqs[i],
> + port->port_no);
> }
> }
> }
> - cycles_count_end(non_pmd, PMD_CYCLES_POLL_IDLE);
> pmd_thread_ctx_time_update(non_pmd);
> dpif_netdev_xps_revalidate_pmd(non_pmd, false);
> ovs_mutex_unlock(&dp->non_pmd_mutex);
> @@ -4082,18 +4037,14 @@ reload:
> lc = UINT_MAX;
> }
>
> - cycles_count_start(pmd);
> + cycles_counter_update(s);
> for (;;) {
> uint64_t iter_packets = 0;
> - pmd_perf_start_iteration(s, pmd->ctx.last_cycles);
> + pmd_perf_start_iteration(s);
> for (i = 0; i < poll_cnt; i++) {
> process_packets =
> - dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
> + dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
> poll_list[i].port_no);
> - cycles_count_intermediate(pmd, poll_list[i].rxq,
> - process_packets
> - ? PMD_CYCLES_POLL_BUSY
> - : PMD_CYCLES_POLL_IDLE);
> iter_packets += process_packets;
> }
>
> @@ -4115,13 +4066,10 @@ reload:
> if (reload) {
> break;
> }
> - cycles_count_intermediate(pmd, NULL, PMD_CYCLES_OVERHEAD);
> }
> - pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets);
> + pmd_perf_end_iteration(s, iter_packets);
> }
>
> - cycles_count_end(pmd, PMD_CYCLES_OVERHEAD);
> -
> poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
> exiting = latch_is_set(&pmd->exit_latch);
> /* Signal here to make sure the pmd finishes @@ -5066,9 +5014,7 @@
> handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
> ovs_mutex_unlock(&pmd->flow_mutex);
> emc_probabilistic_insert(pmd, key, netdev_flow);
> }
> - /* Only error ENOSPC can reach here. We process the packet but do not
> - * install a datapath flow. Treat as successful. */
> - return 0;
> + return error;
> }
>
> static inline void
> --
> 1.9.1
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev