On 09.01.2018 12:00, Ilya Maximets wrote: > +Kevin, +Ian > > On 09.01.2018 11:53, Ilya Maximets wrote: > Comments inline. > > On 04.01.2018 15:07, Jan Scheurich wrote: > This patch instruments the > dpif-netdev datapath to record detailed >> statistics of what is happening in every iteration of a PMD thread. >> >> The collection of detailed statistics can be controlled by a new >> configuration parameter "other_config:pmd-perf-metrics". By default it >> is disabled. The run-time overhead, when enabled, is in the order of 1%. >> >> The covered metrics per iteration are: >> - cycles >> - packets >> - (rx) batches >> - packets/batch >> - max. vhostuser qlen >> - upcalls >> - cycles spent in upcalls >> >> This raw recorded data is used threefold: >> >> 1. In histograms for each of the following metrics: >> - cycles/iteration (log.) >> - packets/iteration (log.) >> - cycles/packet >> - packets/batch >> - max. vhostuser qlen (log.) >> - upcalls >> - cycles/upcall (log) >> The histogram bins are divided linearly or logarithmically. >> >> 2. A cyclic history of the above statistics for 1024 iterations >> >> 3. A cyclic history of the cumulative/average values per millisecond >> wall clock for the last 1024 milliseconds: >> - number of iterations >> - avg. cycles/iteration >> - packets (Kpps) >> - avg. packets/batch >> - avg. max vhost qlen >> - upcalls >> - avg. cycles/upcall >> >> The gathered performance statistics can be printed at any time with the >> new CLI command >> >> ovs-appctl dpif-netdev/pmd-perf-show [-nh] [-it iter_len] [-ms ms_len] >> [-pmd core] [dp] >> >> The options are >> >> -nh: Suppress the histograms >> -it iter_len: Display the last iter_len iteration stats >> -ms ms_len: Display the last ms_len millisecond stats >> -pmd core: Display only >> >> The performance statistics are reset with the existing >> dpif-netdev/pmd-stats-clear command. >> >> The output always contains the following global PMD statistics, >> similar to the pmd-stats-show command: >> >> Time: 15:24:55.270 >> Measurement duration: 1.008 s >> >> pmd thread numa_id 0 core_id 1: >> >> Cycles: 2419034712 (2.40 GHz) >> Iterations: 572817 (1.76 us/it) >> - idle: 486808 (15.9 % cycles) >> - busy: 86009 (84.1 % cycles) >> Packets: 2399607 (2381 Kpps, 848 cycles/pkt) >> Datapath passes: 3599415 (1.50 passes/pkt) >> - EMC hits: 336472 ( 9.3 %) >> - Megaflow hits: 3262943 (90.7 %, 1.00 subtbl lookups/hit) >> - Upcalls: 0 ( 0.0 %, 0.0 us/upcall) >> - Lost upcalls: 0 ( 0.0 %) >> >> Signed-off-by: Jan Scheurich <jan.scheur...@ericsson.com> >> --- >> lib/dp-packet.h | 2 + >> lib/dpif-netdev-perf.c | 309 >> ++++++++++++++++++++++++++++++++++++++++++++++++- >> lib/dpif-netdev-perf.h | 173 ++++++++++++++++++++++++++- >> lib/dpif-netdev.c | 158 +++++++++++++++++++++++-- >> lib/netdev-dpdk.c | 23 +++- >> lib/netdev-dpdk.h | 14 +++ >> ofproto/ofproto-dpif.c | 3 +- >> 7 files changed, 664 insertions(+), 18 deletions(-) >> >> diff --git a/lib/dp-packet.h b/lib/dp-packet.h >> index b4b721c..7950247 100644 >> --- a/lib/dp-packet.h >> +++ b/lib/dp-packet.h >> @@ -695,8 +695,10 @@ enum { NETDEV_MAX_BURST = 32 }; /* Maximum number >> packets in a batch. */ >> >> struct dp_packet_batch { >> size_t count; >> + size_t qfill; /* Number of packets in queue when reading rx burst. */ > This change looks completely wrong to me. 'qfill' should not be here. > This and all the related stuff in further code turns internals of netdev-dpdk > inside out and pushes them to the higher level. I don't like this. 
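Just to make the alternative concrete right away: a rough, untested sketch of how the fill level could be reported without widening the generic batch structure would be an optional out-argument on the receive path. The extra 'qfill' parameter below is purely illustrative, it is not the existing netdev API:

    /* Hypothetical prototype -- illustration only, not the current API. */
    int netdev_rxq_recv(struct netdev_rxq *rx, struct dp_packet_batch *batch,
                        int *qfill);   /* May be NULL if the caller does not care. */

    /* dpif-netdev would then opt in only when metrics are enabled: */
    int qfill = 0;
    error = netdev_rxq_recv(rx, &batch,
                            pmd_perf_metrics_enabled(pmd) ? &qfill : NULL);

Non-DPDK netdevs could simply ignore the pointer, so nothing vhost-specific would leak into struct dp_packet_batch.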
> > Also, currently there are at least 3 patch-sets in the OVS mail list targeted to > get > some statistics and performance counters. > Can we just use an approach similar to "netdev: Custom statistics." ? > (https://mail.openvswitch.org/pipermail/ovs-dev/2018-January/342724.html) > > We may collect drops or count high values of qfill using coverage counters or > in some other way. > > 'qfill', IMHO, should not belong to the generic dp_packet_batch structure. > Batches are not for this. > >> bool trunc; /* true if the batch needs truncate. */ >> struct dp_packet *packets[NETDEV_MAX_BURST]; >> + > Unnecessary empty line. > >> }; >> >> static inline void >> diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c >> index 7d8b7b2..a66a48c 100644 >> --- a/lib/dpif-netdev-perf.c >> +++ b/lib/dpif-netdev-perf.c >> @@ -15,6 +15,7 @@ >> */ >> >> #include <config.h> >> +#include <stdint.h> >> >> #ifdef DPDK_NETDEV >> #include <rte_cycles.h> >> @@ -27,13 +28,307 @@ >> >> VLOG_DEFINE_THIS_MODULE(pmd_perf); >> >> +#ifdef DPDK_NETDEV >> +static uint64_t >> +get_tsc_hz(void) >> +{ >> + return rte_get_tsc_hz(); >> +} >> +#else >> +static uint64_t >> +read_tsc(void) >> +{ >> + register uint64_t tsc asm("eax"); >> + asm volatile (".byte 15, 49" : : : "eax", "edx"); >> + return tsc; >> +} > This breaks non-x86 build: > > lib/dpif-netdev-perf.c:58:31: error: unknown register name 'eax' in asm > register uint64_t tsc asm("eax"); > ^ > lib/dpif-netdev-perf.c:59:40: error: unknown register name 'eax' in asm > asm volatile (".byte 15, 49" : : : "eax", "edx"); > >> + >> +static uint64_t >> +get_tsc_hz(void) >> +{ >> + uint64_t tsc_start = read_tsc(); >> + xsleep(1); >> + return read_tsc() - tsc_start; >> +} >> +#endif >> + >> +static void >> +histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max) >> +{ >> + int i; >> + >> + ovs_assert(min < max); >> + for (i = 0; i < NUM_BINS-1; i++) { >> + hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2); >> + } >> + hist->wall[NUM_BINS-1] = UINT32_MAX; >> +} >> + >> +static void >> +histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max) >> +{ >> + int i, start, bins, wall; >> + double log_min, log_max; >> + >> + ovs_assert(min < max); >> + if (min > 0) { >> + log_min = log(min); >> + log_max = log(max); >> + start = 0; >> + bins = NUM_BINS - 1; >> + } else { >> + hist->wall[0] = 0; >> + log_min = log(1); >> + log_max = log(max); >> + start = 1; >> + bins = NUM_BINS - 2; >> + } >> + wall = start; >> + for (i = 0; i < bins; i++) { >> + /* Make sure each wall is monotonically increasing. */ >> + wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / >> (bins-1))); >> + hist->wall[start + i] = wall++; >> + } >> + if (hist->wall[NUM_BINS-2] < max) { >> + hist->wall[NUM_BINS-2] = max; >> + } >> + hist->wall[NUM_BINS-1] = UINT32_MAX; >> +} >> + >> +uint64_t >> +histogram_samples(const struct histogram *hist) >> +{ >> + uint64_t samples = 0; >> + >> + for (int i = 0; i < NUM_BINS; i++) { >> + samples += hist->bin[i]; >> + } >> + return samples; >> +} >> + >> +static void >> +histogram_clear(struct histogram *hist) >> +{ >> + int i; >> + >> + for (i = 0; i < NUM_BINS; i++) { >> + hist->bin[i] = 0; >> + } >> +} >> + >> +static void >> +history_init(struct history *h) >> +{ >> + memset(h, 0, sizeof(*h)); >> +} >> + >> void >> pmd_perf_stats_init(struct pmd_perf_stats *s) { >> - memset(s, 0 , sizeof(*s)); >> + /* The struct has been zeroed at allocation. 
*/ >> + histogram_walls_set_log(&s->cycles, 500, 24000000); >> + histogram_walls_set_log(&s->pkts, 0, 1000); >> + histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000); >> + histogram_walls_set_lin(&s->pkts_per_batch, 0, 32); >> + histogram_walls_set_lin(&s->upcalls, 0, 30); >> + histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000); >> + histogram_walls_set_log(&s->max_vhost_qfill, 0, 512); >> s->start_ms = time_msec(); >> } >> >> void >> +pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, >> + double duration) >> +{ >> + uint64_t stats[PMD_N_STATS]; >> + double us_per_cycle = 1000000.0 / get_tsc_hz(); >> + >> + if (duration == 0) { >> + return; >> + } >> + >> + pmd_perf_read_counters(s, stats); >> + uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] + >> + stats[PMD_CYCLES_ITER_BUSY]; >> + uint64_t packets = stats[PMD_STAT_RECV]; >> + uint64_t passes = stats[PMD_STAT_RECV] + >> + stats[PMD_STAT_RECIRC]; >> + uint64_t upcalls = stats[PMD_STAT_MISS]; >> + uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL]; >> + uint64_t tot_iter = histogram_samples(&s->pkts); >> + uint64_t idle_iter = s->pkts.bin[0]; >> + uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0; >> + >> + ds_put_format(str, >> + " Cycles: %12"PRIu64" (%.2f GHz)\n" >> + " Iterations: %12"PRIu64" (%.2f us/it)\n" >> + " - idle: %12"PRIu64" (%4.1f %% cycles)\n" >> + " - busy: %12"PRIu64" (%4.1f %% cycles)\n", >> + tot_cycles, (tot_cycles / duration) / 1E9, >> + tot_iter, tot_cycles * us_per_cycle / tot_iter, >> + idle_iter, >> + 100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles, >> + busy_iter, >> + 100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles); >> + if (packets > 0) { >> + ds_put_format(str, >> + " Packets: %12"PRIu64" (%.0f Kpps, %.0f cycles/pkt)\n" >> + " Datapath passes: %12"PRIu64" (%.2f passes/pkt)\n" >> + " - EMC hits: %12"PRIu64" (%4.1f %%)\n" >> + " - Megaflow hits: %12"PRIu64" (%4.1f %%, %.2f subtbl >> lookups/" >> + >> "hit)\n" >> + " - Upcalls: %12"PRIu64" (%4.1f %%, %.1f us/upcall)\n" >> + " - Lost upcalls: %12"PRIu64" (%4.1f %%)\n" >> + "\n", >> + packets, (packets / duration) / 1000, >> + 1.0 * stats[PMD_CYCLES_ITER_BUSY] / packets, >> + passes, packets ? 1.0 * passes / packets : 0, >> + stats[PMD_STAT_EXACT_HIT], >> + 100.0 * stats[PMD_STAT_EXACT_HIT] / passes, >> + stats[PMD_STAT_MASKED_HIT], >> + 100.0 * stats[PMD_STAT_MASKED_HIT] / passes, >> + stats[PMD_STAT_MASKED_HIT] >> + ? 1.0 * stats[PMD_STAT_MASKED_LOOKUP] / >> stats[PMD_STAT_MASKED_HIT] >> + : 0, >> + upcalls, 100.0 * upcalls / passes, >> + upcalls ? 
(upcall_cycles * us_per_cycle) / upcalls : 0, >> + stats[PMD_STAT_LOST], >> + 100.0 * stats[PMD_STAT_LOST] / passes); >> + } else { >> + ds_put_format(str, >> + " Packets: %12"PRIu64"\n" >> + "\n", >> + 0UL); >> + } >> +} >> + >> +void >> +pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s) >> +{ >> + int i; >> + >> + ds_put_cstr(str, "Histograms\n"); >> + ds_put_format(str, >> + " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n", >> + "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", >> + "max vhost qlen", "upcalls/it", "cycles/upcall"); >> + for (i = 0; i < NUM_BINS-1; i++) { >> + ds_put_format(str, >> + " %-9d %-11"PRIu64" %-9d %-11"PRIu64" %-9d %-11"PRIu64 >> + " %-9d %-11"PRIu64" %-9d %-11"PRIu64" %-9d %-11"PRIu64 >> + " %-9d %-11"PRIu64"\n", >> + s->cycles.wall[i], s->cycles.bin[i], >> + s->pkts.wall[i],s->pkts.bin[i], >> + s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i], >> + s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i], >> + s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i], >> + s->upcalls.wall[i], s->upcalls.bin[i], >> + s->cycles_per_upcall.wall[i], s->cycles_per_upcall.bin[i]); >> + } >> + ds_put_format(str, >> + " %-9s %-11"PRIu64" %-9s %-11"PRIu64" %-9s %-11"PRIu64 >> + " %-9s %-11"PRIu64" %-9s %-11"PRIu64" %-9s %-11"PRIu64 >> + " %-9s %-11"PRIu64"\n", >> + ">", s->cycles.bin[i], >> + ">", s->pkts.bin[i], >> + ">", s->cycles_per_pkt.bin[i], >> + ">", s->pkts_per_batch.bin[i], >> + ">", s->max_vhost_qfill.bin[i], >> + ">", s->upcalls.bin[i], >> + ">", s->cycles_per_upcall.bin[i]); >> + if (s->totals.iterations > 0) { >> + ds_put_cstr(str, >> + "-----------------------------------------------------" >> + "-----------------------------------------------------" >> + "------------------------------------------------\n"); >> + ds_put_format(str, >> + " %-21s %-21s %-21s %-21s %-21s %-21s >> %-21s\n", >> + "cycles/it", "packets/it", "cycles/pkt", "pkts/batch", >> + "vhost qlen", "upcalls/it", "cycles/upcall"); >> + ds_put_format(str, >> + " %-21"PRIu64" %-21.5f %-21"PRIu64 >> + " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n", >> + s->totals.cycles / s->totals.iterations, >> + 1.0 * s->totals.pkts / s->totals.iterations, >> + s->totals.pkts >> + ? s->totals.busy_cycles / s->totals.pkts : 0, >> + s->totals.batches >> + ? 1.0 * s->totals.pkts / s->totals.batches : 0, >> + 1.0 * s->totals.max_vhost_qfill / >> s->totals.iterations, >> + 1.0 * s->totals.upcalls / s->totals.iterations, >> + s->totals.upcalls >> + ? s->totals.upcall_cycles / s->totals.upcalls : >> 0); >> + } >> +} >> + >> +void >> +pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s, >> + int n_iter) >> +{ >> + struct iter_stats *is; >> + size_t index; >> + int i; >> + >> + if (n_iter == 0) { >> + return; >> + } >> + ds_put_format(str, " %-17s %-10s %-10s %-10s %-10s " >> + "%-10s %-10s %-10s\n", >> + "tsc", "cycles", "packets", "cycles/pkt", "pkts/batch", >> + "vhost qlen", "upcalls", "cycles/upcall"); >> + for (i = 1; i <= n_iter; i++) { >> + index = (s->iterations.idx + HISTORY_LEN - i) % HISTORY_LEN; >> + is = &s->iterations.sample[index]; >> + ds_put_format(str, >> + " %-17"PRIu64" %-11"PRIu64" %-11"PRIu32 >> + " %-11"PRIu64" %-11"PRIu32" %-11"PRIu32 >> + " %-11"PRIu32" %-11"PRIu32"\n", >> + is->timestamp, >> + is->cycles, >> + is->pkts, >> + is->pkts ? is->cycles / is->pkts : 0, >> + is->batches ? is->pkts / is->batches : 0, >> + is->max_vhost_qfill, >> + is->upcalls, >> + is->upcalls ? 
is->upcall_cycles / is->upcalls : 0); >> + } >> +} >> + >> +void >> +pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int >> n_ms) >> +{ >> + struct iter_stats *is; >> + size_t index; >> + int i; >> + >> + if (n_ms == 0) { >> + return; >> + } >> + ds_put_format(str, >> + " %-12s %-10s %-10s %-10s %-10s" >> + " %-10s %-10s %-10s %-10s\n", >> + "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt", >> + "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall"); >> + for (i = 1; i <= n_ms; i++) { >> + index = (s->milliseconds.idx + HISTORY_LEN - i) % HISTORY_LEN; >> + is = &s->milliseconds.sample[index]; >> + ds_put_format(str, >> + " %-12"PRIu64" %-11"PRIu32" %-11"PRIu64 >> + " %-11"PRIu32" %-11"PRIu64" %-11"PRIu32 >> + " %-11"PRIu32" %-11"PRIu32" %-11"PRIu32"\n", >> + is->timestamp, >> + is->iterations, >> + is->iterations ? is->cycles / is->iterations : 0, >> + is->pkts, >> + is->pkts ? is->busy_cycles / is->pkts : 0, >> + is->batches ? is->pkts / is->batches : 0, >> + is->iterations >> + ? is->max_vhost_qfill / is->iterations : 0, >> + is->upcalls, >> + is->upcalls ? is->upcall_cycles / is->upcalls : 0); >> + } >> +} >> + >> +void >> pmd_perf_read_counters(struct pmd_perf_stats *s, >> uint64_t stats[PMD_N_STATS]) >> { >> @@ -59,8 +354,20 @@ void >> pmd_perf_stats_clear(struct pmd_perf_stats *s) >> { >> s->start_ms = 0; /* Marks start of clearing. */ >> + memset(&s->current, 0 , sizeof(struct iter_stats)); >> + memset(&s->totals, 0 , sizeof(struct iter_stats)); >> for (int i = 0; i < PMD_N_STATS; i++) { >> atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]); >> } >> + histogram_clear(&s->cycles); >> + histogram_clear(&s->pkts); >> + histogram_clear(&s->cycles_per_pkt); >> + histogram_clear(&s->upcalls); >> + histogram_clear(&s->cycles_per_upcall); >> + histogram_clear(&s->pkts_per_batch); >> + histogram_clear(&s->max_vhost_qfill); >> + history_init(&s->iterations); >> + history_init(&s->milliseconds); >> s->start_ms = time_msec(); /* Clearing finished. */ >> + s->milliseconds.sample[0].timestamp = s->start_ms; >> } >> diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h >> index fa5cf37..3611723 100644 >> --- a/lib/dpif-netdev-perf.h >> +++ b/lib/dpif-netdev-perf.h >> @@ -33,6 +33,11 @@ >> extern "C" { >> #endif >> >> +#define NUM_BINS 32 /* Number of histogram bins. */ >> +#define HISTORY_LEN 1000 /* Length of recorded history >> + (iterations and ms). */ >> +#define DEF_HIST_SHOW 20 /* Default number of history samples. */ >> + >> enum pmd_stat_type { >> PMD_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */ >> PMD_STAT_MASKED_HIT, /* Packets that matched in the flow table. */ >> @@ -62,6 +67,28 @@ enum pmd_stat_type { >> PMD_N_STATS >> }; >> >> +struct histogram { >> + uint32_t wall[NUM_BINS]; >> + uint64_t bin[NUM_BINS]; >> +}; >> + >> +struct iter_stats { >> + uint64_t timestamp; >> + uint64_t cycles; >> + uint64_t busy_cycles; >> + uint32_t iterations; >> + uint32_t pkts; >> + uint32_t upcalls; >> + uint32_t upcall_cycles; >> + uint32_t batches; >> + uint32_t max_vhost_qfill; >> +}; >> + >> +struct history { >> + uint64_t idx; >> + struct iter_stats sample[HISTORY_LEN]; >> +}; >> + >> struct pmd_counters { >> atomic_uint64_t n[PMD_N_STATS]; /* Indexed by PMD_STAT_*. 
*/ >> uint64_t zero[PMD_N_STATS]; >> @@ -71,6 +98,17 @@ struct pmd_perf_stats { >> uint64_t start_ms; >> uint64_t last_tsc; >> struct pmd_counters counters; >> + struct iter_stats current; >> + struct iter_stats totals; >> + struct histogram cycles; >> + struct histogram pkts; >> + struct histogram cycles_per_pkt; >> + struct histogram upcalls; >> + struct histogram cycles_per_upcall; >> + struct histogram pkts_per_batch; >> + struct histogram max_vhost_qfill; >> + struct history iterations; >> + struct history milliseconds; >> }; >> >> enum pmd_info_type; >> @@ -98,6 +136,63 @@ pmd_perf_update_counter(struct pmd_perf_stats *s, >> atomic_store_relaxed(&s->counters.n[counter], tmp); >> } >> >> +void pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s, >> + double duration); >> +void pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s); >> +void pmd_perf_format_iteration_history(struct ds *str, >> + struct pmd_perf_stats *s, >> + int n_iter); >> +void pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, >> + int n_ms); >> + >> +void pmd_perf_show(struct unixctl_conn *conn, int argc, >> + const char *argv[], >> + void *aux OVS_UNUSED); >> +static inline void >> +histogram_add_sample(struct histogram *hist, uint32_t val) >> +{ >> + /* TODO: Can do better with binary search? */ >> + for (int i = 0; i < NUM_BINS-1; i++) { >> + if (val <= hist->wall[i]) { >> + hist->bin[i]++; >> + return; >> + } >> + } >> + hist->bin[NUM_BINS-1]++; >> +} >> + >> +uint64_t histogram_samples(const struct histogram *hist); >> + >> +static inline struct iter_stats * >> +history_current(struct history *h) >> +{ >> + return &h->sample[h->idx]; >> +} >> + >> +static inline struct iter_stats * >> +history_next(struct history *h) >> +{ >> + struct iter_stats *next; >> + >> + h->idx++; >> + if (h->idx == HISTORY_LEN) { >> + h->idx = 0; >> + } >> + next = &h->sample[h->idx]; >> + memset(next, 0, sizeof(*next)); >> + return next; >> +} >> + >> +static inline struct iter_stats * >> +history_store(struct history *h, struct iter_stats *is) >> +{ >> + if (is) { >> + h->sample[h->idx] = *is; >> + } >> + /* Advance the history pointer */ >> + return history_next(h); >> +} >> + >> static inline void >> pmd_perf_start_iteration(struct pmd_perf_stats *s, uint64_t now_tsc) >> { >> @@ -105,14 +200,25 @@ pmd_perf_start_iteration(struct pmd_perf_stats *s, >> uint64_t now_tsc) >> /* Stats not yet fully initialised. */ >> return; >> } >> - s->last_tsc = now_tsc; >> + memset(&s->current, 0, sizeof(struct iter_stats)); >> + s->current.timestamp = now_tsc; >> } >> >> static inline void >> pmd_perf_end_iteration(struct pmd_perf_stats *s, uint64_t now_tsc, >> - int packets) >> + int packets, bool full_metrics) >> { >> - uint64_t cycles = now_tsc - s->last_tsc; >> + struct iter_stats *cum_ms; >> + uint64_t cycles, cycles_per_pkt = 0; >> + >> + if (OVS_UNLIKELY(s->current.timestamp == 0)) { >> + /* Stats were cleared during the ongoing iteration. */ >> + return; >> + } >> + >> + cycles = now_tsc - s->current.timestamp; >> + s->current.cycles = cycles; >> + s->current.pkts = packets; >> >> /* No need for atomic updates as only invoked within PMD thread. */ >> if (packets > 0) { >> @@ -120,7 +226,68 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, >> uint64_t now_tsc, >> } else { >> s->counters.n[PMD_CYCLES_ITER_IDLE] += cycles; >> } >> + /* Add iteration sample to histograms. 
*/ >> + histogram_add_sample(&s->cycles, cycles); >> + histogram_add_sample(&s->pkts, packets); >> + >> + if (!full_metrics) { >> + return; >> + } >> + >> + s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles; >> + >> + if (packets > 0) { >> + cycles_per_pkt = cycles / packets; >> + histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt); >> + } >> + if (s->current.batches > 0) { >> + histogram_add_sample(&s->pkts_per_batch, packets / >> s->current.batches); >> + } >> + histogram_add_sample(&s->upcalls, s->current.upcalls); >> + if (s->current.upcalls > 0) { >> + histogram_add_sample(&s->cycles_per_upcall, >> + s->current.upcall_cycles / s->current.upcalls); >> + } >> + histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill); >> + >> + /* Add iteration samples to millisecond stats. */ >> + cum_ms = history_current(&s->milliseconds); >> + cum_ms->iterations++; >> + cum_ms->cycles += cycles; >> + if (packets > 0) { >> + cum_ms->busy_cycles += cycles; >> + } >> + cum_ms->pkts += s->current.pkts; >> + cum_ms->upcalls += s->current.upcalls; >> + cum_ms->upcall_cycles += s->current.upcall_cycles; >> + cum_ms->batches += s->current.batches; >> + cum_ms->max_vhost_qfill += s->current.max_vhost_qfill; >> + >> + /* Store in iteration history. */ >> + history_store(&s->iterations, &s->current); >> + if (now_tsc - s->last_tsc > 10000) { >> + /* Check if ms is completed and store in milliseconds history. */ >> + uint64_t now = time_msec(); >> + if (now != cum_ms->timestamp) { >> + /* Add ms stats to totals. */ >> + s->totals.iterations += cum_ms->iterations; >> + s->totals.cycles += cum_ms->cycles; >> + s->totals.busy_cycles += cum_ms->busy_cycles; >> + s->totals.pkts += cum_ms->pkts; >> + s->totals.upcalls += cum_ms->upcalls; >> + s->totals.upcall_cycles += cum_ms->upcall_cycles; >> + s->totals.batches += cum_ms->batches; >> + s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill; >> + cum_ms = history_next(&s->milliseconds); >> + cum_ms->timestamp = now; >> + } >> + s->last_tsc = now_tsc; >> + } >> +} >> + >> +#ifdef __cplusplus >> } >> +#endif >> >> #ifdef __cplusplus >> } >> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >> index 8b9698a..8f64df3 100644 >> --- a/lib/dpif-netdev.c >> +++ b/lib/dpif-netdev.c >> @@ -53,6 +53,7 @@ >> #include "id-pool.h" >> #include "latch.h" >> #include "netdev.h" >> +#include "netdev-provider.h" >> #include "netdev-vport.h" >> #include "netlink.h" >> #include "odp-execute.h" >> @@ -278,6 +279,8 @@ struct dp_netdev { >> >> /* Probability of EMC insertions is a factor of 'emc_insert_min'.*/ >> OVS_ALIGNED_VAR(CACHE_LINE_SIZE) atomic_uint32_t emc_insert_min; >> + /* Enable collection of PMD performance metrics. */ >> + ATOMIC(bool) pmd_perf_metrics; >> >> /* Protects access to ofproto-dpif-upcall interface during revalidator >> * thread synchronization. */ >> @@ -703,6 +706,7 @@ static inline bool emc_entry_alive(struct emc_entry *ce); >> static void emc_clear_entry(struct emc_entry *ce); >> >> static void dp_netdev_request_reconfigure(struct dp_netdev *dp); >> +static inline bool pmd_perf_metrics_enabled(const struct >> dp_netdev_pmd_thread *pmd); >> >> static void >> emc_cache_init(struct emc_cache *flow_cache) >> @@ -786,7 +790,8 @@ get_dp_netdev(const struct dpif *dpif) >> enum pmd_info_type { >> PMD_INFO_SHOW_STATS, /* Show how cpu cycles are spent. */ >> PMD_INFO_CLEAR_STATS, /* Set the cycles count to 0. */ >> - PMD_INFO_SHOW_RXQ /* Show poll-lists of pmd threads. */ >> + PMD_INFO_SHOW_RXQ, /* Show poll-lists of pmd threads. 
*/ >> + PMD_INFO_PERF_SHOW, /* Show pmd performance details. */ >> }; >> >> static void >> @@ -873,8 +878,46 @@ pmd_info_show_stats(struct ds *reply, >> ds_put_format(reply, >> "\tavg processing cycles per packet: " >> "%.02f (%"PRIu64"/%"PRIu64")\n", >> - stats[PMD_CYCLES_POLL_BUSY] / (double)total_packets, >> - stats[PMD_CYCLES_POLL_BUSY], total_packets); >> + stats[PMD_CYCLES_ITER_BUSY] / (double)total_packets, >> + stats[PMD_CYCLES_ITER_BUSY], total_packets); >> +} >> + >> +static void >> +pmd_info_show_perf(struct ds *reply, >> + struct dp_netdev_pmd_thread *pmd, >> + struct pmd_perf_params *par) >> +{ >> + if (pmd->core_id != NON_PMD_CORE_ID) { >> + char *time_str = >> + xastrftime_msec("%H:%M:%S.###", time_wall_msec(), true); >> + long long now = time_msec(); >> + double duration = (now - pmd->perf_stats.start_ms) / 1000.0; >> + >> + ds_put_cstr(reply, "\n"); >> + ds_put_format(reply, "Time: %s\n", time_str); >> + ds_put_format(reply, "Measurement duration: %.3f s\n", duration); >> + ds_put_cstr(reply, "\n"); >> + format_pmd_thread(reply, pmd); >> + ds_put_cstr(reply, "\n"); >> + pmd_perf_format_overall_stats(reply, &pmd->perf_stats, duration); >> + if (pmd_perf_metrics_enabled(pmd)) { >> + if (par->histograms) { >> + ds_put_cstr(reply, "\n"); >> + pmd_perf_format_histograms(reply, &pmd->perf_stats); >> + } >> + if (par->iter_hist_len > 0) { >> + ds_put_cstr(reply, "\n"); >> + pmd_perf_format_iteration_history(reply, &pmd->perf_stats, >> + par->iter_hist_len); >> + } >> + if (par->ms_hist_len > 0) { >> + ds_put_cstr(reply, "\n"); >> + pmd_perf_format_ms_history(reply, &pmd->perf_stats, >> + par->ms_hist_len); >> + } >> + } >> + free(time_str); >> + } >> } >> >> static int >> @@ -1079,6 +1122,8 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int >> argc, const char *argv[], >> pmd_perf_stats_clear(&pmd->perf_stats); >> } else if (type == PMD_INFO_SHOW_STATS) { >> pmd_info_show_stats(&reply, pmd); >> + } else if (type == PMD_INFO_PERF_SHOW) { >> + pmd_info_show_perf(&reply, pmd, (struct pmd_perf_params *)aux); >> } >> } >> free(pmd_list); >> @@ -1088,6 +1133,48 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int >> argc, const char *argv[], >> unixctl_command_reply(conn, ds_cstr(&reply)); >> ds_destroy(&reply); >> } >> + >> +static void >> +pmd_perf_show_cmd(struct unixctl_conn *conn, int argc, >> + const char *argv[], >> + void *aux OVS_UNUSED) >> +{ >> + struct pmd_perf_params par; >> + long int it_hist = 0, ms_hist = 0; >> + par.histograms = true; >> + >> + while (argc > 1) { >> + if (!strcmp(argv[1], "-nh")) { >> + par.histograms = false; >> + argc -= 1; >> + argv += 1; >> + } else if (!strcmp(argv[1], "-it") && argc > 2) { >> + it_hist = strtol(argv[2], NULL, 10); >> + if (it_hist < 0) { >> + it_hist = 0; >> + } else if (it_hist > HISTORY_LEN) { >> + it_hist = HISTORY_LEN; >> + } >> + argc -= 2; >> + argv += 2; >> + } else if (!strcmp(argv[1], "-ms") && argc > 2) { >> + ms_hist = strtol(argv[2], NULL, 10); >> + if (ms_hist < 0) { >> + ms_hist = 0; >> + } else if (ms_hist > HISTORY_LEN) { >> + ms_hist = HISTORY_LEN; >> + } >> + argc -= 2; >> + argv += 2; >> + } else { >> + break; >> + } >> + } >> + par.iter_hist_len = it_hist; >> + par.ms_hist_len = ms_hist; >> + par.command_type = PMD_INFO_PERF_SHOW; >> + dpif_netdev_pmd_info(conn, argc, argv, &par); >> +} >> >> static int >> dpif_netdev_init(void) >> @@ -1105,6 +1192,12 @@ dpif_netdev_init(void) >> unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core | dp]", >> 0, 2, dpif_netdev_pmd_info, >> (void *)&poll_aux); 
>> + unixctl_command_register("dpif-netdev/pmd-perf-show", >> + "[-nh] [-it iter-history-len]" >> + " [-ms ms-history-len]" >> + " [-pmd core | dp]", >> + 0, 7, pmd_perf_show_cmd, >> + NULL); >> unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", >> 0, 1, dpif_netdev_pmd_rebalance, >> NULL); >> @@ -2980,6 +3073,18 @@ dpif_netdev_set_config(struct dpif *dpif, const >> struct smap *other_config) >> } >> } >> >> + bool perf_enabled = smap_get_bool(other_config, "pmd-perf-metrics", >> false); >> + bool cur_perf_enabled; >> + atomic_read_relaxed(&dp->pmd_perf_metrics, &cur_perf_enabled); >> + if (perf_enabled != cur_perf_enabled) { >> + atomic_store_relaxed(&dp->pmd_perf_metrics, perf_enabled); >> + if (perf_enabled) { >> + VLOG_INFO("PMD performance metrics collection enabled"); >> + } else { >> + VLOG_INFO("PMD performance metrics collection disabled"); >> + } >> + } >> + >> return 0; >> } >> >> @@ -3160,6 +3265,14 @@ cycles_count_intermediate(struct dp_netdev_pmd_thread >> *pmd, >> } >> } >> >> +static inline bool >> +pmd_perf_metrics_enabled(const struct dp_netdev_pmd_thread *pmd) >> +{ >> + bool pmd_perf_enabled; >> + atomic_read_relaxed(&pmd->dp->pmd_perf_metrics, &pmd_perf_enabled); >> + return pmd_perf_enabled; >> +} >> + >> static void >> dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx, >> enum rxq_cycles_counter_type type, >> @@ -3236,6 +3349,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread >> *pmd, >> struct netdev_rxq *rx, >> odp_port_t port_no) >> { >> + struct pmd_perf_stats *s = &pmd->perf_stats; >> struct dp_packet_batch batch; >> int error; >> int batch_cnt = 0; >> @@ -3245,8 +3359,25 @@ dp_netdev_process_rxq_port(struct >> dp_netdev_pmd_thread *pmd, >> if (!error) { >> *recirc_depth_get() = 0; >> pmd_thread_ctx_time_update(pmd); >> - >> batch_cnt = batch.count; >> + if (pmd_perf_metrics_enabled(pmd)) { >> + /* Update batch histogram. */ >> + s->current.batches++; >> + histogram_add_sample(&s->pkts_per_batch, batch_cnt); >> + /* Update the maximum Rx queue fill level. */ >> + uint32_t qfill = batch.qfill; >> + switch (netdev_dpdk_get_type(netdev_rxq_get_netdev(rx))) { >> + case DPDK_DEV_VHOST: >> + if (qfill > s->current.max_vhost_qfill) { >> + s->current.max_vhost_qfill = qfill; >> + } >> + break; >> + case DPDK_DEV_ETH: >> + default: >> + break; >> + } > I'm sorry, but I'll vote to not accept changes like this. This runs contrary > to the whole modular, object-oriented architecture of OVS. > > In the worst case you may pass something from this layer to be filled by > netdev. > But I don't really like that approach either. > >> + } >> + /* Process packet batch. 
*/ >> dp_netdev_input(pmd, &batch, port_no); >> dp_netdev_pmd_flush_output_packets(pmd); >> } else if (error != EAGAIN && error != EOPNOTSUPP) { >> @@ -4084,7 +4215,9 @@ reload: >> cycles_count_start(pmd); >> for (;;) { >> uint64_t iter_packets = 0; >> + >> pmd_perf_start_iteration(s, pmd->ctx.last_cycles); >> + >> for (i = 0; i < poll_cnt; i++) { >> process_packets = >> dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx, >> @@ -4116,7 +4249,9 @@ reload: >> } >> cycles_count_intermediate(pmd, NULL, PMD_CYCLES_OVERHEAD); >> } >> - pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets); >> + >> + pmd_perf_end_iteration(s, pmd->ctx.last_cycles, iter_packets, >> + pmd_perf_metrics_enabled(pmd)); >> } >> >> cycles_count_end(pmd, PMD_CYCLES_OVERHEAD); >> @@ -5012,6 +5147,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, >> struct match match; >> ovs_u128 ufid; >> int error; >> + uint64_t cycles = cycles_counter(); >> >> match.tun_md.valid = false; >> miniflow_expand(&key->mf, &match.flow); >> @@ -5065,9 +5201,15 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd, >> ovs_mutex_unlock(&pmd->flow_mutex); >> emc_probabilistic_insert(pmd, key, netdev_flow); >> } >> - /* Only error ENOSPC can reach here. We process the packet but do not >> - * install a datapath flow. Treat as successful. */ >> - return 0; >> + if (pmd_perf_metrics_enabled(pmd)) { >> + /* Update upcall stats. */ >> + cycles = cycles_counter() - cycles; >> + struct pmd_perf_stats *s = &pmd->perf_stats; >> + s->current.upcalls++; >> + s->current.upcall_cycles += cycles; >> + histogram_add_sample(&s->cycles_per_upcall, cycles); > Hmm. Why is this done manually? Maybe PMD_CYCLES_UPCALL should be used here? > >> + } >> + return error; >> } >> >> static inline void >> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c >> index 364f545..d701621 100644 >> --- a/lib/netdev-dpdk.c >> +++ b/lib/netdev-dpdk.c >> @@ -36,6 +36,7 @@ >> #include <rte_mbuf.h> >> #include <rte_meter.h> >> #include <rte_pci.h> >> +#include <rte_version.h> >> #include <rte_vhost.h> >> #include <rte_version.h> >> >> @@ -188,11 +189,6 @@ enum { DPDK_RING_SIZE = 256 }; >> BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE)); >> enum { DRAIN_TSC = 200000ULL }; >> >> -enum dpdk_dev_type { >> - DPDK_DEV_ETH = 0, >> - DPDK_DEV_VHOST = 1, >> -}; >> - >> /* Quality of Service */ >> >> /* An instance of a QoS configuration. Always associated with a particular >> @@ -835,6 +831,13 @@ netdev_dpdk_cast(const struct netdev *netdev) >> return CONTAINER_OF(netdev, struct netdev_dpdk, up); >> } >> >> +enum dpdk_dev_type >> +netdev_dpdk_get_type(const struct netdev *netdev) >> +{ >> + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); >> + return dev->type; >> +} >> + >> static struct netdev * >> netdev_dpdk_alloc(void) >> { >> @@ -1644,6 +1647,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq, >> return EAGAIN; >> } >> >> + batch->qfill = nb_rx; >> + >> + if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) { >> + batch->qfill += rte_vhost_rx_queue_count(netdev_dpdk_get_vid(dev), >> + qid * VIRTIO_QNUM >> + + VIRTIO_TXQ); > This looks strange. > Isn't it illogical to spend additional time requesting queue count if > we're already late in packet processing?
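To illustrate the coverage counter alternative I mentioned above: if the interesting event is simply "the vhost queue was still full after reading a complete burst", the netdev could record that locally instead of exporting the exact fill level to the datapath layer. A rough, untested sketch (the counter name is made up for illustration):

    /* In lib/netdev-dpdk.c -- sketch only. */
    #include "coverage.h"

    COVERAGE_DEFINE(vhost_rxq_full);

    /* ...in netdev_dpdk_vhost_rxq_recv(), after the dequeue burst: */
    if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) {
        /* A full burst usually means more packets are still waiting,
         * i.e. the PMD is not keeping up with this queue. */
        COVERAGE_INC(vhost_rxq_full);
    }

The event rate would then be visible via 'ovs-appctl coverage/show' without any new datapath plumbing.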
And the most important thing at this point: calling 'netdev_dpdk_get_vid(dev)' here without checking the result may lead to a segmentation fault. See daf22bf7a826 ("netdev-dpdk: Fix calling vhost API with negative vid.") for details. > >> + } >> + >> if (policer) { >> dropped = nb_rx; >> nb_rx = ingress_policer_run(policer, >> @@ -1683,6 +1694,8 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct >> dp_packet_batch *batch) >> return EAGAIN; >> } >> >> + batch->qfill = nb_rx; >> + >> if (policer) { >> dropped = nb_rx; >> nb_rx = ingress_policer_run(policer, >> diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h >> index b7d02a7..2b357db 100644 >> --- a/lib/netdev-dpdk.h >> +++ b/lib/netdev-dpdk.h >> @@ -22,11 +22,18 @@ >> #include "openvswitch/compiler.h" >> >> struct dp_packet; >> +struct netdev; >> + >> +enum dpdk_dev_type { >> + DPDK_DEV_ETH = 0, >> + DPDK_DEV_VHOST = 1, >> +}; >> >> #ifdef DPDK_NETDEV >> >> void netdev_dpdk_register(void); >> void free_dpdk_buf(struct dp_packet *); >> +enum dpdk_dev_type netdev_dpdk_get_type(const struct netdev *netdev); >> >> #else >> >> @@ -41,6 +48,13 @@ free_dpdk_buf(struct dp_packet *buf OVS_UNUSED) >> /* Nothing */ >> } >> >> +static inline enum dpdk_dev_type >> +netdev_dpdk_get_type(const struct netdev *netdev OVS_UNUSED) >> +{ >> + /* Nothing to do. Return value zero to make compiler happy. */ >> + return DPDK_DEV_ETH; >> +} >> + >> #endif >> >> #endif /* netdev-dpdk.h */ >> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c >> index 43b7b89..3845b12 100644 >> --- a/ofproto/ofproto-dpif.c >> +++ b/ofproto/ofproto-dpif.c >> @@ -5317,7 +5317,8 @@ dpif_show_backer(const struct dpif_backer *backer, >> struct ds *ds) >> >> dpif_get_dp_stats(backer->dpif, &dp_stats); >> ds_put_format(ds, "%s: hit:%"PRIu64" missed:%"PRIu64"\n", >> - dpif_name(backer->dpif), dp_stats.n_hit, >> dp_stats.n_missed); >> + dpif_name(backer->dpif), >> + dp_stats.n_hit + dp_stats.n_mask_hit, dp_stats.n_missed); >> >> shash_init(&ofproto_shash); >> ofprotos = get_ofprotos(&ofproto_shash); >> >
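Coming back to the netdev_dpdk_get_vid() point above, to be concrete: the usual pattern (see the referenced commit) is to read the vid once and skip the vhost API call when it is negative. Something like this untested sketch for the new queue-count probe:

    /* Sketch only: guard the extra vhost call in netdev_dpdk_vhost_rxq_recv(). */
    if (OVS_UNLIKELY(nb_rx == NETDEV_MAX_BURST)) {
        int vid = netdev_dpdk_get_vid(dev);

        /* The guest may have been detached since the dequeue burst, in
         * which case the vid is negative and must not be passed to
         * rte_vhost_rx_queue_count(). */
        if (vid >= 0) {
            batch->qfill += rte_vhost_rx_queue_count(vid,
                                                     qid * VIRTIO_QNUM
                                                     + VIRTIO_TXQ);
        }
    }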