+Kevin, +Ian

On 09.01.2018 11:53, Ilya Maximets wrote:
Comments inline.
On 04.01.2018 15:07, Jan Scheurich wrote:> This patch instruments the dpif-netdev datapath to record detailed > statistics of what is happening in every iteration of a PMD thread. > > The collection of detailed statistics can be controlled by a new > configuration parameter "other_config:pmd-perf-metrics". By default it > is disabled. The run-time overhead, when enabled, is in the order of 1%. > > The covered metrics per iteration are: > - cycles > - packets > - (rx) batches > - packets/batch > - max. vhostuser qlen > - upcalls > - cycles spent in upcalls > > This raw recorded data is used threefold: > > 1. In histograms for each of the following metrics: > - cycles/iteration (log.) > - packets/iteration (log.) > - cycles/packet > - packets/batch > - max. vhostuser qlen (log.) > - upcalls > - cycles/upcall (log) > The histograms bins are divided linear or logarithmic. > > 2. A cyclic history of the above statistics for 1024 iterations > > 3. A cyclic history of the cummulative/average values per millisecond > wall clock for the last 1024 milliseconds: > - number of iterations > - avg. cycles/iteration > - packets (Kpps) > - avg. packets/batch > - avg. max vhost qlen > - upcalls > - avg. cycles/upcall > > The gathered performance statists can be printed at any time with the > new CLI command > > ovs-appctl dpif-netdev/pmd-perf-show [-nh] [-it iter_len] [-ms ms_len] > [-pmd core] [dp] > > The options are > > -nh: Suppress the histograms > -it iter_len: Display the last iter_len iteration stats > -ms ms_len: Display the last ms_len millisecond stats > -pmd core: Display only > > The performance statistics are reset with the existing > dpif-netdev/pmd-stats-clear command. > > The output always contains the following global PMD statistics, > similar to the pmd-stats-show command: > > Time: 15:24:55.270 > Measurement duration: 1.008 s > > pmd thread numa_id 0 core_id 1: > > Cycles: 2419034712 (2.40 GHz) > Iterations: 572817 (1.76 us/it) > - idle: 486808 (15.9 % cycles) > - busy: 86009 (84.1 % cycles) > Packets: 2399607 (2381 Kpps, 848 cycles/pkt) > Datapath passes: 3599415 (1.50 passes/pkt) > - EMC hits: 336472 ( 9.3 %) > - Megaflow hits: 3262943 (90.7 %, 1.00 subtbl lookups/hit) > - Upcalls: 0 ( 0.0 %, 0.0 us/upcall) > - Lost upcalls: 0 ( 0.0 %) > > Signed-off-by: Jan Scheurich <jan.scheur...@ericsson.com> > --- > lib/dp-packet.h | 2 + > lib/dpif-netdev-perf.c | 309 > ++++++++++++++++++++++++++++++++++++++++++++++++- > lib/dpif-netdev-perf.h | 173 ++++++++++++++++++++++++++- > lib/dpif-netdev.c | 158 +++++++++++++++++++++++-- > lib/netdev-dpdk.c | 23 +++- > lib/netdev-dpdk.h | 14 +++ > ofproto/ofproto-dpif.c | 3 +- > 7 files changed, 664 insertions(+), 18 deletions(-) > > diff --git a/lib/dp-packet.h b/lib/dp-packet.h > index b4b721c..7950247 100644 > --- a/lib/dp-packet.h > +++ b/lib/dp-packet.h > @@ -695,8 +695,10 @@ enum { NETDEV_MAX_BURST = 32 }; /* Maximum number > packets in a batch. */ > > struct dp_packet_batch { > size_t count; > + size_t qfill; /* Number of packets in queue when reading rx burst. */ This change looks completely wrog for me. 'qfill' should not be here. This and all the related stuff in further code turns internatls of netdev-dpdk inside out and pushes them to the higher level. I don't like this. Also, currently there are at least 3 patch-sets in OVS mail list targeted to get some statisics and performance counters. Can we just use approach similar to "netdev: Custom statistics." ? 
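(The custom-statistics patch I mean is linked just below.) To make the alternative concrete, here is a minimal sketch -- the counter name is invented and this is not meant as a final design -- of keeping the observation inside netdev-dpdk with an OVS coverage counter instead of exporting 'qfill' through struct dp_packet_batch:

    /* Hypothetical sketch for netdev-dpdk.c: count full vhost rx bursts
     * with a coverage counter.  A full burst is exactly the case where
     * the virtqueue may still hold a backlog. */
    #include "coverage.h"

    COVERAGE_DEFINE(vhost_rxq_full_burst);

    static inline void
    vhost_rxq_count_full_burst(uint16_t nb_rx)
    {
        if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) {
            COVERAGE_INC(vhost_rxq_full_burst);
        }
    }

The event counts would then show up in "ovs-appctl coverage/show" without any change to the datapath-facing structures.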
(https://mail.openvswitch.org/pipermail/ovs-dev/2018-January/342724.html) We may collect drops or count high values of qfill using coverage counters or in some other way. 'qfill', IMHO, should not belong to generic dp_packet_batch structure. Butches are not for this. > bool trunc; /* true if the batch needs truncate. */ > struct dp_packet *packets[NETDEV_MAX_BURST]; > + Not needed empty line. > }; > > static inline void > diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c > index 7d8b7b2..a66a48c 100644 > --- a/lib/dpif-netdev-perf.c > +++ b/lib/dpif-netdev-perf.c > @@ -15,6 +15,7 @@ > */ > > #include <config.h> > +#include <stdint.h> > > #ifdef DPDK_NETDEV > #include <rte_cycles.h> > @@ -27,13 +28,307 @@ > > VLOG_DEFINE_THIS_MODULE(pmd_perf); > > +#ifdef DPDK_NETDEV > +static uint64_t > +get_tsc_hz(void) > +{ > + return rte_get_tsc_hz(); > +} > +#else > +static uint64_t > +read_tsc(void) > +{ > + register uint64_t tsc asm("eax"); > + asm volatile (".byte 15, 49" : : : "eax", "edx"); > + return tsc; > +} This breaks non-x86 build: lib/dpif-netdev-perf.c:58:31: error: unknown register name 'eax' in asm register uint64_t tsc asm("eax"); ^ lib/dpif-netdev-perf.c:59:40: error: unknown register name 'eax' in asm asm volatile (".byte 15, 49" : : : "eax", "edx"); > + > +static uint64_t > +get_tsc_hz(void) > +{ > + uint64_t tsc_start = read_tsc(); > + xsleep(1); > + return read_tsc() - tsc_start; > +} > +#endif > + > +static void > +histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max) > +{ > + int i; > + > + ovs_assert(min < max); > + for (i = 0; i < NUM_BINS-1; i++) { > + hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2); > + } > + hist->wall[NUM_BINS-1] = UINT32_MAX; > +} > + > +static void > +histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max) > +{ > + int i, start, bins, wall; > + double log_min, log_max; > + > + ovs_assert(min < max); > + if (min > 0) { > + log_min = log(min); > + log_max = log(max); > + start = 0; > + bins = NUM_BINS - 1; > + } else { > + hist->wall[0] = 0; > + log_min = log(1); > + log_max = log(max); > + start = 1; > + bins = NUM_BINS - 2; > + } > + wall = start; > + for (i = 0; i < bins; i++) { > + /* Make sure each wall is monotonically increasing. */ > + wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / > (bins-1))); > + hist->wall[start + i] = wall++; > + } > + if (hist->wall[NUM_BINS-2] < max) { > + hist->wall[NUM_BINS-2] = max; > + } > + hist->wall[NUM_BINS-1] = UINT32_MAX; > +} > + > +uint64_t > +histogram_samples(const struct histogram *hist) > +{ > + uint64_t samples = 0; > + > + for (int i = 0; i < NUM_BINS; i++) { > + samples += hist->bin[i]; > + } > + return samples; > +} > + > +static void > +histogram_clear(struct histogram *hist) > +{ > + int i; > + > + for (i = 0; i < NUM_BINS; i++) { > + hist->bin[i] = 0; > + } > +} > + > +static void > +history_init(struct history *h) > +{ > + memset(h, 0, sizeof(*h)); > +} > + > void > pmd_perf_stats_init(struct pmd_perf_stats *s) { > - memset(s, 0 , sizeof(*s)); > + /* The struct has been zeroed at allocation. 
*/ > + histogram_walls_set_log(&s->cycles, 500, 24000000); > + histogram_walls_set_log(&s->pkts, 0, 1000); > + histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000); > + histogram_walls_set_lin(&s->pkts_per_batch, 0, 32); > + histogram_walls_set_lin(&s->upcalls, 0, 30); > + histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000); > + histogram_walls_set_log(&s->max_vhost_qfill, 0, 512); > s->start_ms = time_msec(); > } > > void > +pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, > + double duration) > +{ > + uint64_t stats[PMD_N_STATS]; > + double us_per_cycle = 1000000.0 / get_tsc_hz(); > + > + if (duration == 0) { > + return; > + } > + > + pmd_perf_read_counters(s, stats); > + uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] + > + stats[PMD_CYCLES_ITER_BUSY]; > + uint64_t packets = stats[PMD_STAT_RECV]; > + uint64_t passes = stats[PMD_STAT_RECV] + > + stats[PMD_STAT_RECIRC]; > + uint64_t upcalls = stats[PMD_STAT_MISS]; > + uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL]; > + uint64_t tot_iter = histogram_samples(&s->pkts); > + uint64_t idle_iter = s->pkts.bin[0]; > + uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0; > + > + ds_put_format(str, > + " Cycles: %12"PRIu64" (%.2f GHz)\n" > + " Iterations: %12"PRIu64" (%.2f us/it)\n" > + " - idle: %12"PRIu64" (%4.1f %% cycles)\n" > + " - busy: %12"PRIu64" (%4.1f %% cycles)\n", > + tot_cycles, (tot_cycles / duration) / 1E9, > + tot_iter, tot_cycles * us_per_cycle / tot_iter, > + idle_iter, > + 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles, > + busy_iter, > + 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles); > + if (packets > 0) { > + ds_put_format(str, > + " Packets: %12"PRIu64" (%.0f Kpps, %.0f cycles/pkt)\n" > + " Datapath passes: %12"PRIu64" (%.2f passes/pkt)\n" > + " - EMC hits: %12"PRIu64" (%4.1f %%)\n" > + " - Megaflow hits: %12"PRIu64" (%4.1f %%, %.2f subtbl lookups/" > + "hit)\n" > + " - Upcalls: %12"PRIu64" (%4.1f %%, %.1f us/upcall)\n" > + " - Lost upcalls: %12"PRIu64" (%4.1f %%)\n" > + "\n", > + packets, (packets / duration) / 1000, > + 1.0 * stats[PMD_CYCLES_ITER_BUSY] / packets, > + passes, packets ? 1.0 * passes / packets : 0, > + stats[PMD_STAT_EXACT_HIT], > + 100.0 * stats[PMD_STAT_EXACT_HIT] / passes, > + stats[PMD_STAT_MASKED_HIT], > + 100.0 * stats[PMD_STAT_MASKED_HIT] / passes, > + stats[PMD_STAT_MASKED_HIT] > + ? 1.0 * stats[PMD_STAT_MASKED_LOOKUP] / > stats[PMD_STAT_MASKED_HIT] > + : 0, > + upcalls, 100.0 * upcalls / passes, > + upcalls ? 
(upcall_cycles * us_per_cycle) / upcalls : 0, > + stats[PMD_STAT_LOST], > + 100.0 * stats[PMD_STAT_LOST] / passes); > + } else { > + ds_put_format(str, > + " Packets: %12"PRIu64"\n" > + "\n", > + 0UL); > + } > +} > + > +void > +pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s) > +{ > + int i; > + > + ds_put_cstr(str, "Histograms\n"); > + ds_put_format(str, > + " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", > + "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", > + "max vhost qlen", "upcalls/it", "cycles/upcall"); > + for (i = 0; i < NUM_BINS-1; i++) { > + ds_put_format(str, > + " %-9d %-11"PRIu64" %-9d %-11"PRIu64" %-9d %-11"PRIu64 > + " %-9d %-11"PRIu64" %-9d %-11"PRIu64" %-9d %-11"PRIu64 > + " %-9d %-11"PRIu64"\n", > + s->cycles.wall[i], s->cycles.bin[i], > + s->pkts.wall[i],s->pkts.bin[i], > + s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i], > + s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i], > + s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i], > + s->upcalls.wall[i], s->upcalls.bin[i], > + s->cycles_per_upcall.wall[i], s->cycles_per_upcall.bin[i]); > + } > + ds_put_format(str, > + " %-9s %-11"PRIu64" %-9s %-11"PRIu64" %-9s %-11"PRIu64 > + " %-9s %-11"PRIu64" %-9s %-11"PRIu64" %-9s %-11"PRIu64 > + " %-9s %-11"PRIu64"\n", > + ">", s->cycles.bin[i], > + ">", s->pkts.bin[i], > + ">", s->cycles_per_pkt.bin[i], > + ">", s->pkts_per_batch.bin[i], > + ">", s->max_vhost_qfill.bin[i], > + ">", s->upcalls.bin[i], > + ">", s->cycles_per_upcall.bin[i]); > + if (s->totals.iterations > 0) { > + ds_put_cstr(str, > + "-----------------------------------------------------" > + "-----------------------------------------------------" > + "------------------------------------------------\n"); > + ds_put_format(str, > + " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", > + "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", > + "vhost qlen", "upcalls/it", "cycles/upcall"); > + ds_put_format(str, > + " %-21"PRIu64" %-21.5f %-21"PRIu64 > + " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n", > + s->totals.cycles / s->totals.iterations, > + 1.0 * s->totals.pkts / s->totals.iterations, > + s->totals.pkts > + ? s->totals.busy_cycles / s->totals.pkts : 0, > + s->totals.batches > + ? 1.0 * s->totals.pkts / s->totals.batches : 0, > + 1.0 * s->totals.max_vhost_qfill / s->totals.iterations, > + 1.0 * s->totals.upcalls / s->totals.iterations, > + s->totals.upcalls > + ? s->totals.upcall_cycles / s->totals.upcalls : 0); > + } > +} > + > +void > +pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s, > + int n_iter) > +{ > + struct iter_stats *is; > + size_t index; > + int i; > + > + if (n_iter == 0) { > + return; > + } > + ds_put_format(str, " %-17s %-10s %-10s %-10s %-10s " > + "%-10s %-10s %-10s\n", > + "tsc", "cycles", "packets", "cycles/pkt", "pkts/batch", > + "vhost qlen", "upcalls", "cycles/upcall"); > + for (i = 1; i <= n_iter; i++) { > + index = (s->iterations.idx + HISTORY_LEN - i) % HISTORY_LEN; > + is = &s->iterations.sample[index]; > + ds_put_format(str, > + " %-17"PRIu64" %-11"PRIu64" %-11"PRIu32 > + " %-11"PRIu64" %-11"PRIu32" %-11"PRIu32 > + " %-11"PRIu32" %-11"PRIu32"\n", > + is->timestamp, > + is->cycles, > + is->pkts, > + is->pkts ? is->cycles / is->pkts : 0, > + is->batches ? is->pkts / is->batches : 0, > + is->max_vhost_qfill, > + is->upcalls, > + is->upcalls ? 
is->upcall_cycles / is->upcalls : 0); > + } > +} > + > +void > +pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int > n_ms) > +{ > + struct iter_stats *is; > + size_t index; > + int i; > + > + if (n_ms == 0) { > + return; > + } > + ds_put_format(str, > + " %-12s %-10s %-10s %-10s %-10s" > + " %-10s %-10s %-10s %-10s\n", > + "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt", > + "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall"); > + for (i = 1; i <= n_ms; i++) { > + index = (s->milliseconds.idx + HISTORY_LEN - i) % HISTORY_LEN; > + is = &s->milliseconds.sample[index]; > + ds_put_format(str, > + " %-12"PRIu64" %-11"PRIu32" %-11"PRIu64 > + " %-11"PRIu32" %-11"PRIu64" %-11"PRIu32 > + " %-11"PRIu32" %-11"PRIu32" %-11"PRIu32"\n", > + is->timestamp, > + is->iterations, > + is->iterations ? is->cycles / is->iterations : 0, > + is->pkts, > + is->pkts ? is->busy_cycles / is->pkts : 0, > + is->batches ? is->pkts / is->batches : 0, > + is->iterations > + ? is->max_vhost_qfill / is->iterations : 0, > + is->upcalls, > + is->upcalls ? is->upcall_cycles / is->upcalls : 0); > + } > +} > + > +void > pmd_perf_read_counters(struct pmd_perf_stats *s, > uint64_t stats[PMD_N_STATS]) > { > @@ -59,8 +354,20 @@ void > pmd_perf_stats_clear(struct pmd_perf_stats *s) > { > s->start_ms = 0; /* Marks start of clearing. */ > + memset(&s->current, 0 , sizeof(struct iter_stats)); > + memset(&s->totals, 0 , sizeof(struct iter_stats)); > for (int i = 0; i < PMD_N_STATS; i++) { > atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]); > } > + histogram_clear(&s->cycles); > + histogram_clear(&s->pkts); > + histogram_clear(&s->cycles_per_pkt); > + histogram_clear(&s->upcalls); > + histogram_clear(&s->cycles_per_upcall); > + histogram_clear(&s->pkts_per_batch); > + histogram_clear(&s->max_vhost_qfill); > + history_init(&s->iterations); > + history_init(&s->milliseconds); > s->start_ms = time_msec(); /* Clearing finished. */ > + s->milliseconds.sample[0].timestamp = s->start_ms; > } > diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h > index fa5cf37..3611723 100644 > --- a/lib/dpif-netdev-perf.h > +++ b/lib/dpif-netdev-perf.h > @@ -33,6 +33,11 @@ > extern "C" { > #endif > > +#define NUM_BINS 32 /* Number of histogram bins. */ > +#define HISTORY_LEN 1000 /* Length of recorded history > + (iterations and ms). */ > +#define DEF_HIST_SHOW 20 /* Default number of history samples. */ > + > enum pmd_stat_type { > PMD_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */ > PMD_STAT_MASKED_HIT, /* Packets that matched in the flow table. */ > @@ -62,6 +67,28 @@ enum pmd_stat_type { > PMD_N_STATS > }; > > +struct histogram { > + uint32_t wall[NUM_BINS]; > + uint64_t bin[NUM_BINS]; > +}; > + > +struct iter_stats { > + uint64_t timestamp; > + uint64_t cycles; > + uint64_t busy_cycles; > + uint32_t iterations; > + uint32_t pkts; > + uint32_t upcalls; > + uint32_t upcall_cycles; > + uint32_t batches; > + uint32_t max_vhost_qfill; > +}; > + > +struct history { > + uint64_t idx; > + struct iter_stats sample[HISTORY_LEN]; > +}; > + > struct pmd_counters { > atomic_uint64_t n[PMD_N_STATS]; /* Indexed by PMD_STAT_*. 
*/ > uint64_t zero[PMD_N_STATS]; > @@ -71,6 +98,17 @@ struct pmd_perf_stats { > uint64_t start_ms; > uint64_t last_tsc; > struct pmd_counters counters; > + struct iter_stats current; > + struct iter_stats totals; > + struct histogram cycles; > + struct histogram pkts; > + struct histogram cycles_per_pkt; > + struct histogram upcalls; > + struct histogram cycles_per_upcall; > + struct histogram pkts_per_batch; > + struct histogram max_vhost_qfill; > + struct history iterations; > + struct history milliseconds; > }; > > enum pmd_info_type; > @@ -98,6 +136,63 @@ pmd_perf_update_counter(struct pmd_perf_stats *s, > atomic_store_relaxed(&s->counters.n[counter], tmp); > } > > +void pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, > + double duration); > +void pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s); > +void pmd_perf_format_iteration_history(struct ds *str, > + struct pmd_perf_stats *s, > + int n_iter); > +void pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, > + int n_ms); > + > +void pmd_perf_show(struct unixctl_conn *conn, int argc, > + const char *argv[], > + void *aux OVS_UNUSED); > +static inline void > +histogram_add_sample(struct histogram *hist, uint32_t val) > +{ > + /* TODO: Can do better with binary search? */ > + for (int i = 0; i < NUM_BINS-1; i++) { > + if (val <= hist->wall[i]) { > + hist->bin[i]++; > + return; > + } > + } > + hist->bin[NUM_BINS-1]++; > +} > + > +uint64_t histogram_samples(const struct histogram *hist); > + > +static inline struct iter_stats * > +history_current(struct history *h) > +{ > + return &h->sample[h->idx]; > +} > + > +static inline struct iter_stats * > +history_next(struct history *h) > +{ > + struct iter_stats *next; > + > + h->idx++; > + if (h->idx == HISTORY_LEN) { > + h->idx = 0; > + } > + next = &h->sample[h->idx]; > + memset(next, 0, sizeof(*next)); > + return next; > +} > + > +static inline struct iter_stats * > +history_store(struct history *h, struct iter_stats *is) > +{ > + if (is) { > + h->sample[h->idx] = *is; > + } > + /* Advance the history pointer */ > + return history_next(h); > +} > + > static inline void > pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc) > { > @@ -105,14 +200,25 @@ pmd_perf_start_iteration(struct pmd_perf_stats *s, > uint64_t now_tsc) > /* Stats not yet fully initialised. */ > return; > } > - s->last_tsc = now_tsc; > + memset(&s->current, 0, sizeof(struct iter_stats)); > + s->current.timestamp = now_tsc; > } > > static inline void > pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc, > - int packets) > + int packets, bool full_metrics) > { > - uint64_t cycles = now_tsc - s->last_tsc; > + struct iter_stats *cum_ms; > + uint64_t cycles, cycles_per_pkt = 0; > + > + if (OVS_UNLIKELY(s->current.timestamp == 0)) { > + /* Stats were cleared during the ongoing iteration. */ > + return; > + } > + > + cycles = now_tsc - s->current.timestamp; > + s->current.cycles = cycles; > + s->current.pkts = packets; > > /* No need for atomic updates as only invoked within PMD thread. */ > if (packets > 0) { > @@ -120,7 +226,68 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, > uint64_t now_tsc, > } else { > s->counters.n[PMD_CYCLES_ITER_IDLE] += cycles; > } > + /* Add iteration sample to histograms. 
*/ > + histogram_add_sample(&s->cycles, cycles); > + histogram_add_sample(&s->pkts, packets); > + > + if (!full_metrics) { > + return; > + } > + > + s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles; > + > + if (packets > 0) { > + cycles_per_pkt = cycles / packets; > + histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt); > + } > + if (s->current.batches > 0) { > + histogram_add_sample(&s->pkts_per_batch, packets / > s->current.batches); > + } > + histogram_add_sample(&s->upcalls, s->current.upcalls); > + if (s->current.upcalls > 0) { > + histogram_add_sample(&s->cycles_per_upcall, > + s->current.upcall_cycles / s->current.upcalls); > + } > + histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill); > + > + /* Add iteration samples to millisecond stats. */ > + cum_ms = history_current(&s->milliseconds); > + cum_ms->iterations++; > + cum_ms->cycles += cycles; > + if (packets > 0) { > + cum_ms->busy_cycles += cycles; > + } > + cum_ms->pkts += s->current.pkts; > + cum_ms->upcalls += s->current.upcalls; > + cum_ms->upcall_cycles += s->current.upcall_cycles; > + cum_ms->batches += s->current.batches; > + cum_ms->max_vhost_qfill += s->current.max_vhost_qfill; > + > + /* Store in iteration history. */ > + history_store(&s->iterations, &s->current); > + if (now_tsc - s->last_tsc > 10000) { > + /* Check if ms is completed and store in milliseconds history. */ > + uint64_t now = time_msec(); > + if (now != cum_ms->timestamp) { > + /* Add ms stats to totals. */ > + s->totals.iterations += cum_ms->iterations; > + s->totals.cycles += cum_ms->cycles; > + s->totals.busy_cycles += cum_ms->busy_cycles; > + s->totals.pkts += cum_ms->pkts; > + s->totals.upcalls += cum_ms->upcalls; > + s->totals.upcall_cycles += cum_ms->upcall_cycles; > + s->totals.batches += cum_ms->batches; > + s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill; > + cum_ms = history_next(&s->milliseconds); > + cum_ms->timestamp = now; > + } > + s->last_tsc = now_tsc; > + } > +} > + > +#ifdef __cplusplus > } > +#endif > > #ifdef __cplusplus > } > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 8b9698a..8f64df3 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -53,6 +53,7 @@ > #include "id-pool.h" > #include "latch.h" > #include "netdev.h" > +#include "netdev-provider.h" > #include "netdev-vport.h" > #include "netlink.h" > #include "odp-execute.h" > @@ -278,6 +279,8 @@ struct dp_netdev { > > /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/ > OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min; > + /* Enable collection of PMD performance metrics. */ > + ATOMIC(bool) pmd_perf_metrics; > > /* Protects access to ofproto-dpif-upcall interface during revalidator > * thread synchronization. */ > @@ -703,6 +706,7 @@ static inline bool emc_entry_alive(struct emc_entry *ce); > static void emc_clear_entry(struct emc_entry *ce); > > static void dp_netdev_request_reconfigure(struct dp_netdev *dp); > +static inline bool pmd_perf_metrics_enabled(const struct > dp_netdev_pmd_thread *pmd); > > static void > emc_cache_init(struct emc_cache *flow_cache) > @@ -786,7 +790,8 @@ get_dp_netdev(const struct dpif *dpif) > enum pmd_info_type { > PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */ > PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ > - PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */ > + PMD_INFO_SHOW_RXQ, /* Show poll-lists of pmd threads. */ > + PMD_INFO_PERF_SHOW, /* Show pmd performance details. 
*/ > }; > > static void > @@ -873,8 +878,46 @@ pmd_info_show_stats(struct ds *reply, > ds_put_format(reply, > "\tavg processing cycles per packet: " > "%.02f (%"PRIu64"/%"PRIu64")\n", > - stats[PMD_CYCLES_POLL_BUSY] / (double)total_packets, > - stats[PMD_CYCLES_POLL_BUSY], total_packets); > + stats[PMD_CYCLES_ITER_BUSY] / (double)total_packets, > + stats[PMD_CYCLES_ITER_BUSY], total_packets); > +} > + > +static void > +pmd_info_show_perf(struct ds *reply, > + struct dp_netdev_pmd_thread *pmd, > + struct pmd_perf_params *par) > +{ > + if (pmd->core_id != NON_PMD_CORE_ID) { > + char *time_str = > + xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true); > + long long now = time_msec(); > + double duration = (now - pmd->perf_stats.start_ms) / 1000.0; > + > + ds_put_cstr(reply, "\n"); > + ds_put_format(reply, "Time: %s\n", time_str); > + ds_put_format(reply, "Measurement duration: %.3f s\n", duration); > + ds_put_cstr(reply, "\n"); > + format_pmd_thread(reply, pmd); > + ds_put_cstr(reply, "\n"); > + pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration); > + if (pmd_perf_metrics_enabled(pmd)) { > + if (par->histograms) { > + ds_put_cstr(reply, "\n"); > + pmd_perf_format_histograms(reply, &pmd->perf_stats); > + } > + if (par->iter_hist_len > 0) { > + ds_put_cstr(reply, "\n"); > + pmd_perf_format_iteration_history(reply, &pmd->perf_stats, > + par->iter_hist_len); > + } > + if (par->ms_hist_len > 0) { > + ds_put_cstr(reply, "\n"); > + pmd_perf_format_ms_history(reply, &pmd->perf_stats, > + par->ms_hist_len); > + } > + } > + free(time_str); > + } > } > > static int > @@ -1079,6 +1122,8 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int > argc, const char *argv[], > pmd_perf_stats_clear(&pmd->perf_stats); > } else if (type == PMD_INFO_SHOW_STATS) { > pmd_info_show_stats(&reply, pmd); > + } else if (type == PMD_INFO_PERF_SHOW) { > + pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); > } > } > free(pmd_list); > @@ -1088,6 +1133,48 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int > argc, const char *argv[], > unixctl_command_reply(conn, ds_cstr(&reply)); > ds_destroy(&reply); > } > + > +static void > +pmd_perf_show_cmd(struct unixctl_conn *conn, int argc, > + const char *argv[], > + void *aux OVS_UNUSED) > +{ > + struct pmd_perf_params par; > + long int it_hist = 0, ms_hist = 0; > + par.histograms = true; > + > + while (argc > 1) { > + if (!strcmp(argv[1], "-nh")) { > + par.histograms = false; > + argc -= 1; > + argv += 1; > + } else if (!strcmp(argv[1], "-it") && argc > 2) { > + it_hist = strtol(argv[2], NULL, 10); > + if (it_hist < 0) { > + it_hist = 0; > + } else if (it_hist > HISTORY_LEN) { > + it_hist = HISTORY_LEN; > + } > + argc -= 2; > + argv += 2; > + } else if (!strcmp(argv[1], "-ms") && argc > 2) { > + ms_hist = strtol(argv[2], NULL, 10); > + if (ms_hist < 0) { > + ms_hist = 0; > + } else if (ms_hist > HISTORY_LEN) { > + ms_hist = HISTORY_LEN; > + } > + argc -= 2; > + argv += 2; > + } else { > + break; > + } > + } > + par.iter_hist_len = it_hist; > + par.ms_hist_len = ms_hist; > + par.command_type = PMD_INFO_PERF_SHOW; > + dpif_netdev_pmd_info(conn, argc, argv, &par); > +} > > static int > dpif_netdev_init(void) > @@ -1105,6 +1192,12 @@ dpif_netdev_init(void) > unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core | dp]", > 0, 2, dpif_netdev_pmd_info, > (void *)&poll_aux); > + unixctl_command_register("dpif-netdev/pmd-perf-show", > + "[-nh] [-it iter-history-len]" > + " [-ms ms-history-len]" > + " [-pmd core | dp]", > + 0, 7, pmd_perf_show_cmd, > 
+ NULL); > unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", > 0, 1, dpif_netdev_pmd_rebalance, > NULL); > @@ -2980,6 +3073,18 @@ dpif_netdev_set_config(struct dpif *dpif, const struct > smap *other_config) > } > } > > + bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", > false); > + bool cur_perf_enabled; > + atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled); > + if (perf_enabled != cur_perf_enabled) { > + atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled); > + if (perf_enabled) { > + VLOG_INFO("PMD performance metrics collection enabled"); > + } else { > + VLOG_INFO("PMD performance metrics collection disabled"); > + } > + } > + > return 0; > } > > @@ -3160,6 +3265,14 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread > *pmd, > } > } > > +static inline bool > +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd) > +{ > + bool pmd_perf_enabled; > + atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled); > + return pmd_perf_enabled; > +} > + > static void > dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, > enum rxq_cycles_counter_type type, > @@ -3236,6 +3349,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread > *pmd, > struct netdev_rxq *rx, > odp_port_t port_no) > { > + struct pmd_perf_stats *s = &pmd->perf_stats; > struct dp_packet_batch batch; > int error; > int batch_cnt = 0; > @@ -3245,8 +3359,25 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread > *pmd, > if (!error) { > *recirc_depth_get() = 0; > pmd_thread_ctx_time_update(pmd); > - > batch_cnt = batch.count; > + if (pmd_perf_metrics_enabled(pmd)) { > + /* Update batch histogram. */ > + s->current.batches++; > + histogram_add_sample(&s->pkts_per_batch, batch_cnt); > + /* Update the maximum Rx queue fill level. */ > + uint32_t qfill = batch.qfill; > + switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) { > + case DPDK_DEV_VHOST: > + if (qfill > s->current.max_vhost_qfill) { > + s->current.max_vhost_qfill = qfill; > + } > + break; > + case DPDK_DEV_ETH: > + default: > + break; > + } I'm sorry, but I'll vote to not accept changes like this. This is contraversal to all the modular object-oriented architecure of OVS. In the worst case you may pass something from this layer to be filled by netdev. But I don't really like such approach too. > + } > + /* Process packet batch. 
*/ > dp_netdev_input(pmd, &batch, port_no); > dp_netdev_pmd_flush_output_packets(pmd); > } else if (error != EAGAIN && error != EOPNOTSUPP) { > @@ -4084,7 +4215,9 @@ reload: > cycles_count_start(pmd); > for (;;) { > uint64_t iter_packets = 0; > + > pmd_perf_start_iteration(s, pmd->ctx.last_cycles); > + > for (i = 0; i < poll_cnt; i++) { > process_packets = > dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx, > @@ -4116,7 +4249,9 @@ reload: > } > cycles_count_intermediate(pmd, NULL, PMD_CYCLES_OVERHEAD); > } > - pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets); > + > + pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets, > + pmd_perf_metrics_enabled(pmd)); > } > > cycles_count_end(pmd, PMD_CYCLES_OVERHEAD); > @@ -5012,6 +5147,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, > struct match match; > ovs_u128 ufid; > int error; > + uint64_t cycles = cycles_counter(); > > match.tun_md.valid = false; > miniflow_expand(&key->mf, &match.flow); > @@ -5065,9 +5201,15 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, > ovs_mutex_unlock(&pmd->flow_mutex); > emc_probabilistic_insert(pmd, key, netdev_flow); > } > - /* Only error ENOSPC can reach here. We process the packet but do not > - * install a datapath flow. Treat as successful. */ > - return 0; > + if (pmd_perf_metrics_enabled(pmd)) { > + /* Update upcall stats. */ > + cycles = cycles_counter() - cycles; > + struct pmd_perf_stats *s = &pmd->perf_stats; > + s->current.upcalls++; > + s->current.upcall_cycles += cycles; > + histogram_add_sample(&s->cycles_per_upcall, cycles); Hmm. Why this is done manually? Maybe PMD_CYCLES_UPCALL should be used here? > + } > + return error; > } > > static inline void > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index 364f545..d701621 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -36,6 +36,7 @@ > #include <rte_mbuf.h> > #include <rte_meter.h> > #include <rte_pci.h> > +#include <rte_version.h> > #include <rte_vhost.h> > #include <rte_version.h> > > @@ -188,11 +189,6 @@ enum { DPDK_RING_SIZE = 256 }; > BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE)); > enum { DRAIN_TSC = 200000ULL }; > > -enum dpdk_dev_type { > - DPDK_DEV_ETH = 0, > - DPDK_DEV_VHOST = 1, > -}; > - > /* Quality of Service */ > > /* An instance of a QoS configuration. Always associated with a particular > @@ -835,6 +831,13 @@ netdev_dpdk_cast(const struct netdev *netdev) > return CONTAINER_OF(netdev, struct netdev_dpdk, up); > } > > +enum dpdk_dev_type > +netdev_dpdk_get_type(const struct netdev *netdev) > +{ > + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); > + return dev->type; > +} > + > static struct netdev * > netdev_dpdk_alloc(void) > { > @@ -1644,6 +1647,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, > return EAGAIN; > } > > + batch->qfill = nb_rx; > + > + if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) { > + batch->qfill += rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev), > + qid * VIRTIO_QNUM > + + VIRTIO_TXQ); This looks strange. Isn't it illogical to spend addtitional time requesting queue count if we're already late in packet processing? 
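If the concern is fast-path cost, one option (sketch only -- the 'collect_qfill' flag and the helper name below are hypothetical, not something this patch adds) would be to make the extra call doubly conditional: it already runs only when the burst came back completely full, i.e. when packets may still be waiting in the virtqueue, and it could additionally be skipped unless pmd-perf-metrics is enabled:

    static uint32_t
    vhost_rxq_fill_estimate(struct netdev_dpdk *dev, int qid,
                            uint16_t nb_rx, bool collect_qfill)
    {
        uint32_t qfill = nb_rx;

        /* Only pay for rte_vhost_rx_queue_count() when the caller asked
         * for it and the burst was full, so a backlog is likely. */
        if (collect_qfill && OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) {
            qfill += rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev),
                                              qid * VIRTIO_QNUM + VIRTIO_TXQ);
        }
        return qfill;
    }

Whether plumbing such a flag down into the netdev layer is acceptable is, of course, the same modularity question as above.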
> + } > + > if (policer) { > dropped = nb_rx; > nb_rx = ingress_policer_run(policer, > @@ -1683,6 +1694,8 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct > dp_packet_batch *batch) > return EAGAIN; > } > > + batch->qfill = nb_rx; > + > if (policer) { > dropped = nb_rx; > nb_rx = ingress_policer_run(policer, > diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h > index b7d02a7..2b357db 100644 > --- a/lib/netdev-dpdk.h > +++ b/lib/netdev-dpdk.h > @@ -22,11 +22,18 @@ > #include "openvswitch/compiler.h" > > struct dp_packet; > +struct netdev; > + > +enum dpdk_dev_type { > + DPDK_DEV_ETH = 0, > + DPDK_DEV_VHOST = 1, > +}; > > #ifdef DPDK_NETDEV > > void netdev_dpdk_register(void); > void free_dpdk_buf(struct dp_packet *); > +enum dpdk_dev_type netdev_dpdk_get_type(const struct netdev *netdev); > > #else > > @@ -41,6 +48,13 @@ free_dpdk_buf(struct dp_packet *buf OVS_UNUSED) > /* Nothing */ > } > > +static inline enum dpdk_dev_type > +netdev_dpdk_get_type(const struct netdev *netdev OVS_UNUSED) > +{ > + /* Nothing to do. Return value zero to make compiler happy. */ > + return DPDK_DEV_ETH; > +} > + > #endif > > #endif /* netdev-dpdk.h */ > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c > index 43b7b89..3845b12 100644 > --- a/ofproto/ofproto-dpif.c > +++ b/ofproto/ofproto-dpif.c > @@ -5317,7 +5317,8 @@ dpif_show_backer(const struct dpif_backer *backer, > struct ds *ds) > > dpif_get_dp_stats(backer->dpif, &dp_stats); > ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n", > - dpif_name(backer->dpif), dp_stats.n_hit, > dp_stats.n_missed); > + dpif_name(backer->dpif), > + dp_stats.n_hit + dp_stats.n_mask_hit, dp_stats.n_missed); > > shash_init(&ofproto_shash); > ofprotos = get_ofprotos(&ofproto_shash); > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev