Hi Ilya, Thanks for your reply. We can have a look at the x86 test results then. I understand patch 1054571. If both patches are applied to master, I can rebase the EMC prefetch patch on top of it. Mandatory hash computation at ingress simplifies the logic a lot, and it costs only a small price even in the worst case (EMC/SMC disabled and no hash feature/load balancing enabled).
Best Regards, Wei Yanqin -----Original Message----- From: Ilya Maximets <i.maxim...@samsung.com> Sent: Friday, March 22, 2019 9:12 PM To: Yanqin Wei (Arm Technology China) <yanqin....@arm.com>; d...@openvswitch.org; ian.sto...@intel.com Cc: nd <n...@arm.com> Subject: Re: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by prefetching EMC entry. On 22.03.2019 11:44, Yanqin Wei (Arm Technology China) wrote: > Hi , OVS Maintainers, > > Could you help to have a look at this patch? Thanks a lot. Hi. Thanks for improving performance and sorry for delay. Review process here in OVS is a bit slow due to lack of reviewers. I have a plan to test this patch a bit on a next week. Want to check the performance impact on PVP cases on x86. BTW, I have a patch that affects same code. Maybe it'll be interesting to you: https://patchwork.ozlabs.org/patch/1054571/ Best regards, Ilya Maximets. > > Best Regards, > Wei Yanqin > > -----Original Message----- > From: Yanqin Wei <yanqin....@arm.com> > Sent: Wednesday, March 13, 2019 1:28 PM > To: d...@openvswitch.org > Cc: nd <n...@arm.com>; Gavin Hu (Arm Technology China) > <gavin...@arm.com>; Yanqin Wei (Arm Technology China) > <yanqin....@arm.com> > Subject: [ovs-dev][PATCH v3] dpif-netdev: dfc_process optimization by > prefetching EMC entry. > > It is observed that the throughput of multi-flow is worse than single-flow in > the EMC NIC to NIC cases. It is because CPU cache-miss increasing in EMC > lookup. Each flow need load at least one EMC entry to CPU cache(several cache > lines) and compare it with packet miniflow. > This patch improve it by prefetching EMC entry in advance. Hash value > can be obtained from dpdk rss hash, so this step can be advanced ahead > of > miniflow_extract() and prefetch EMC entry there. The prefetching size is > defined as ROUND_UP(128,CACHE_LINE_SIZE), which can cover majority traffic > including TCP/UDP protocol and need 2 cache lines in most modern CPU. 
> Performance test was run in some arm platform. 1000/10000 flows NIC2NIC test > achieved around 10% throughput improvement in thunderX2(aarch64 platform). > > Signed-off-by: Yanqin Wei <yanqin....@arm.com> > Reviewed-by: Gavin Hu <gavin...@arm.com> > --- > lib/dpif-netdev.c | 80 > ++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 52 insertions(+), 28 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index > 4d6d0c3..982082c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -189,6 +189,10 @@ struct netdev_flow_key { > #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX / \ > DEFAULT_EM_FLOW_INSERT_INV_PROB) > > +/* DEFAULT_EMC_PREFETCH_SIZE can cover majority traffic including > +TCP/UDP > + * protocol. */ > +#define DEFAULT_EMC_PREFETCH_SIZE ROUND_UP(128,CACHE_LINE_SIZE) > + > struct emc_entry { > struct dp_netdev_flow *flow; > struct netdev_flow_key key; /* key.hash used for emc hash value. */ > @@ -6166,15 +6170,20 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread > *pmd, struct dp_packet *packet_, } > > static inline uint32_t > -dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_packet_rss_hash(struct dp_packet *packet, > + bool md_is_valid) > { > - uint32_t hash; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > + hash = dp_packet_get_rss_hash(packet); > + > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + } > dp_packet_set_rss_hash(packet, hash); > } > > @@ -6182,24 +6191,23 @@ > dpif_netdev_packet_get_rss_hash_orig_pkt(struct dp_packet *packet, } > > static inline uint32_t > 
-dpif_netdev_packet_get_rss_hash(struct dp_packet *packet, > - const struct miniflow *mf) > +dpif_netdev_packet_get_hash_5tuple(struct dp_packet *packet, > + const struct miniflow *mf, > + bool md_is_valid) > { > - uint32_t hash, recirc_depth; > + uint32_t hash,recirc_depth; > > - if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > - hash = dp_packet_get_rss_hash(packet); > - } else { > - hash = miniflow_hash_5tuple(mf, 0); > - dp_packet_set_rss_hash(packet, hash); > - } > + hash = miniflow_hash_5tuple(mf, 0); > + dp_packet_set_rss_hash(packet, hash); > > - /* The RSS hash must account for the recirculation depth to avoid > - * collisions in the exact match cache */ > - recirc_depth = *recirc_depth_get_unsafe(); > - if (OVS_UNLIKELY(recirc_depth)) { > - hash = hash_finish(hash, recirc_depth); > - dp_packet_set_rss_hash(packet, hash); > + if (md_is_valid) { > + /* The RSS hash must account for the recirculation depth to avoid > + * collisions in the exact match cache */ > + recirc_depth = *recirc_depth_get_unsafe(); > + if (OVS_UNLIKELY(recirc_depth)) { > + hash = hash_finish(hash, recirc_depth); > + dp_packet_set_rss_hash(packet, hash); > + } > } > return hash; > } > @@ -6390,6 +6398,7 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > bool smc_enable_db; > size_t map_cnt = 0; > bool batch_enable = true; > + bool is_5tuple_hash_needed; > > atomic_read_relaxed(&pmd->dp->smc_enable_db, &smc_enable_db); > pmd_perf_update_counter(&pmd->perf_stats, > @@ -6436,16 +6445,31 @@ dfc_processing(struct dp_netdev_pmd_thread *pmd, > } > } > > - miniflow_extract(packet, &key->mf); > - key->len = 0; /* Not computed yet. 
*/ > /* If EMC and SMC disabled skip hash computation */ > if (smc_enable_db == true || cur_min != 0) { > - if (!md_is_valid) { > - key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, > - &key->mf); > + if (OVS_LIKELY(dp_packet_rss_valid(packet))) { > + is_5tuple_hash_needed = false; > + key->hash = > + > dpif_netdev_packet_get_packet_rss_hash(packet,md_is_valid); > + if (cur_min) { > + ovs_prefetch_range( > + &cache->emc_cache.entries[key->hash & > EM_FLOW_HASH_MASK], > + DEFAULT_EMC_PREFETCH_SIZE); > + } > } else { > - key->hash = dpif_netdev_packet_get_rss_hash(packet, > &key->mf); > + is_5tuple_hash_needed = true; > } > + } else { > + is_5tuple_hash_needed = false; > + } > + > + miniflow_extract(packet, &key->mf); > + key->len = 0; /* Not computed yet. */ > + > + /* If 5tuple hash is needed */ > + if (is_5tuple_hash_needed) { > + key->hash = dpif_netdev_packet_get_hash_5tuple(packet, > + &key->mf, > + > + md_is_valid); > } > if (cur_min) { > flow = emc_lookup(&cache->emc_cache, key); > -- > 2.7.4 > > > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev