>From: Ilya Maximets [mailto:i.maxim...@samsung.com] >Sent: Monday, June 5, 2017 9:18 AM >To: Kavanagh, Mark B <mark.b.kavan...@intel.com>; d...@openvswitch.org; >Darrell Ball ><db...@vmware.com> >Cc: Heetae Ahn <heetae82....@samsung.com>; Daniele Di Proietto ><diproiet...@ovn.org>; Ben >Pfaff <b...@ovn.org>; Pravin Shelar <pshe...@ovn.org>; Loftus, Ciara ><ciara.lof...@intel.com>; >Stokes, Ian <ian.sto...@intel.com>; Kevin Traynor <ktray...@redhat.com> >Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >addition/deletion of PMD >threads. > >Thanks for testing, Mark. > >Comments inline. > >Best regards, Ilya Maximets. > >On 01.06.2017 17:30, Kavanagh, Mark B wrote: >>> From: ovs-dev-boun...@openvswitch.org >>> [mailto:ovs-dev-boun...@openvswitch.org] On Behalf >Of >>> Ilya Maximets >>> Sent: Tuesday, May 30, 2017 3:12 PM >>> To: d...@openvswitch.org; Darrell Ball <db...@vmware.com> >>> Cc: Heetae Ahn <heetae82....@samsung.com>; Ilya Maximets >>> <i.maxim...@samsung.com> >>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >>> addition/deletion of PMD >threads. >>> >>> Currently, change of 'pmd-cpu-mask' is very heavy operation. >>> It requires destroying of all the PMD threads and creating >>> them back. After that, all the threads will sleep until >>> ports' redistribution finished. >>> >>> This patch adds ability to not stop the datapath while >>> adjusting number/placement of PMD threads. All not affected >>> threads will forward traffic without any additional latencies. >>> >>> id-pool created for static tx queue ids to keep them sequential >>> in a flexible way. non-PMD thread will always have >>> static_tx_qid = 0 as it was before. >>> >>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com> >> >> Hi Ilya, >> >> Patch LGTM - just once comment inline. 
>> >> I conducted some light testing and observed the following: >> - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask >> (in previous >implementation, forwarding stopped completely) >> - occasional drop to zero in forwarding rate when modifying >> pmd-cpu-mask (e.g. >change from '2' to '12') > >I guess the drop to zero in this case could be fixed by the second >patch of the series. (If it is caused by port reconfiguration.)
Yes, that's true - my colleague, Ian Stokes, will review and test that patch shortly. > >> - temporary drop of ~1mpps when adding/deleting port (similar to existing >implementation) >> >> I've also conducted the following sanity checks on the patch, and found no >> issues: >> - checkpatch.py >> - compilation with clang >> - compilation with gcc >> - sparse >> - make check (no additional tests fail after application of this patch) >> >> Tested-by: Mark Kavanagh <mark.b.kavan...@intel.com> >> >> Thanks, >> Mark >> >>> --- >>> lib/dpif-netdev.c | 143 >>> +++++++++++++++++++++++++++++++++++++++--------------- >>> tests/pmd.at | 2 +- >>> 2 files changed, 105 insertions(+), 40 deletions(-) >>> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >>> index b50164b..596d133 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -48,6 +48,7 @@ >>> #include "fat-rwlock.h" >>> #include "flow.h" >>> #include "hmapx.h" >>> +#include "id-pool.h" >>> #include "latch.h" >>> #include "netdev.h" >>> #include "netdev-vport.h" >>> @@ -277,6 +278,9 @@ struct dp_netdev { >>> >>> /* Stores all 'struct dp_netdev_pmd_thread's. */ >>> struct cmap poll_threads; >>> + /* id pool for per thread static_tx_qid. */ >>> + struct id_pool *tx_qid_pool; >>> + struct ovs_mutex tx_qid_pool_mutex; >>> >>> /* Protects the access of the 'struct dp_netdev_pmd_thread' >>> * instance for non-pmd thread. */ >>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread { >>> /* Queue id used by this pmd thread to send packets on all netdevs if >>> * XPS disabled for this netdev. All static_tx_qid's are unique and less >>> * than 'cmap_count(dp->poll_threads)'. */ >>> - const int static_tx_qid; >>> + uint32_t static_tx_qid; >>> >>> struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and >>> 'tx_ports'. */ >>> /* List of rx queues to poll. 
*/ >>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread >>> *dp_netdev_get_pmd(struct dp_netdev >>> *dp, >>> unsigned core_id); >>> static struct dp_netdev_pmd_thread * >>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); >>> +static void dp_netdev_del_pmd(struct dp_netdev *dp, >>> + struct dp_netdev_pmd_thread *pmd); >>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd); >>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd); >>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, >>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct >>> dpif_class *class, >>> atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); >>> >>> cmap_init(&dp->poll_threads); >>> + >>> + ovs_mutex_init(&dp->tx_qid_pool_mutex); >>> + /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. >>> */ >>> + dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1); >>> + >>> ovs_mutex_init_recursive(&dp->non_pmd_mutex); >>> ovsthread_key_create(&dp->per_pmd_key, NULL); >>> >>> ovs_mutex_lock(&dp->port_mutex); >>> + /* non-PMD will be created before all other threads and will >>> + * allocate static_tx_qid = 0. 
*/ >>> dp_netdev_set_nonpmd(dp); >>> >>> error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class, >>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp) >>> dp_netdev_destroy_all_pmds(dp, true); >>> cmap_destroy(&dp->poll_threads); >>> >>> + ovs_mutex_destroy(&dp->tx_qid_pool_mutex); >>> + id_pool_destroy(dp->tx_qid_pool); >>> + >>> ovs_mutex_destroy(&dp->non_pmd_mutex); >>> ovsthread_key_delete(dp->per_pmd_key); >>> >>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) >>> OVS_REQUIRES(dp- >>>> port_mutex) >>> } >>> >>> static void >>> +reload_affected_pmds(struct dp_netdev *dp) >>> +{ >>> + struct dp_netdev_pmd_thread *pmd; >>> + >>> + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>> + if (pmd->need_reload) { >>> + dp_netdev_reload_pmd__(pmd); >>> + pmd->need_reload = false; >>> + } >>> + } >>> +} >>> + >>> +static void >>> reconfigure_pmd_threads(struct dp_netdev *dp) >>> OVS_REQUIRES(dp->port_mutex) >>> { >>> struct dp_netdev_pmd_thread *pmd; >>> struct ovs_numa_dump *pmd_cores; >>> + struct ovs_numa_info_core *core; >>> + struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); >>> + struct hmapx_node *node; >>> bool changed = false; >>> + bool need_to_adjust_static_tx_qids = false; >>> >>> /* The pmd threads should be started only if there's a pmd port in the >>> * datapath. 
If the user didn't provide any "pmd-cpu-mask", we start >>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp) >>> pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); >>> } >>> >>> - /* Check for changed configuration */ >>> - if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - >>> 1) { >>> - changed = true; >>> - } else { >>> - CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>> - if (pmd->core_id != NON_PMD_CORE_ID >>> - && !ovs_numa_dump_contains_core(pmd_cores, >>> - pmd->numa_id, >>> - pmd->core_id)) { >>> - changed = true; >>> - break; >>> - } >>> + /* We need to adjust 'static_tx_qid's only if we're reducing number of >>> + * PMD threads. Otherwise, new threads will allocate all the freed >>> ids. */ >> >> Can you elaborate on why the static_tx_qids need to be adjusted - it's still >> not clear to >me from the comment. > >I think, currently, the main case for keeping tx queue ids sequential >is the vhost-user ports with big number of queues but disabled by >driver in guest. > >If vhost-user device has enough number of queues to not use dynamic >distribution (XPS) and some of queues will be disabled inside the >guest, we will face almost same issue as described in >347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.") > >For example: > > We may have 4 threads and vhost-user port with 4 queues. > Lets assume that static tx qids distributed as follows: > thread #0 --> queue #0 > thread #1 --> queue #1 > thread #2 --> queue #3 > thread #3 --> queue #2 > Now guest disables queue #3 (ethtool -L eth0 combined 3) > And we're manually changing pmd-cpu-mask to remove thread #3. > (You need to be sure that threads wasn't reloaded here.) > At this point, if tx queue ids wasn't adjusted, we will have: > thread #0 --> queue #0 --> queue #0 > thread #1 --> queue #1 --> queue #1 > thread #2 --> queue #3 --> queue #0 > Where the last column is the result of remapping inside > __netdev_dpdk_vhost_send(). 
> So, in this case, there will be 3 queues available, but > only 2 used by 3 threads. --> unused queue #2 and locking > on access to queue #0. Okay, I got it - thanks for the detailed explanation Ilya. With that, Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com> > >>> + if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - >>> 1) { >>> + need_to_adjust_static_tx_qids = true; >>> + } >>> + >>> + /* Check for unwanted pmd threads */ >>> + CMAP_FOR_EACH(pmd, node, &dp->poll_threads) { >>> + if (pmd->core_id == NON_PMD_CORE_ID) { >>> + continue; >>> + } >>> + if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, >>> + pmd->core_id)) { >>> + hmapx_add(&to_delete, pmd); >>> + } else if (need_to_adjust_static_tx_qids) { >>> + pmd->need_reload = true; >>> } >>> } >>> >>> - /* Destroy the old and recreate the new pmd threads. We don't perform >>> an >>> - * incremental update because we would have to adjust 'static_tx_qid'. >>> */ >>> - if (changed) { >>> - struct ovs_numa_info_core *core; >>> - struct ovs_numa_info_numa *numa; >>> + HMAPX_FOR_EACH (node, &to_delete) { >>> + pmd = (struct dp_netdev_pmd_thread *) node->data; >>> + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", >>> + pmd->numa_id, pmd->core_id); >>> + dp_netdev_del_pmd(dp, pmd); >>> + } >>> + changed = !hmapx_is_empty(&to_delete); >>> + hmapx_destroy(&to_delete); >>> >>> - /* Do not destroy the non pmd thread. */ >>> - dp_netdev_destroy_all_pmds(dp, false); >>> - FOR_EACH_CORE_ON_DUMP (core, pmd_cores) { >>> - struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd); >>> + if (need_to_adjust_static_tx_qids) { >>> + /* 'static_tx_qid's are not sequential now. >>> + * Reload remaining threads to fix this. 
*/ >>> + reload_affected_pmds(dp); >>> + } >>> >>> + /* Check for required new pmd threads */ >>> + FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { >>> + pmd = dp_netdev_get_pmd(dp, core->core_id); >>> + if (!pmd) { >>> + pmd = xzalloc(sizeof *pmd); >>> dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id); >>> - >>> pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd); >>> + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", >>> + pmd->numa_id, pmd->core_id); >>> + changed = true; >>> + } else { >>> + dp_netdev_pmd_unref(pmd); >>> } >>> + } >>> + >>> + if (changed) { >>> + struct ovs_numa_info_numa *numa; >>> >>> /* Log the number of pmd threads per numa node. */ >>> FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { >>> - VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d", >>> + VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d", >>> numa->n_cores, numa->numa_id); >>> } >>> } >>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp) >>> } >>> >>> static void >>> -reload_affected_pmds(struct dp_netdev *dp) >>> -{ >>> - struct dp_netdev_pmd_thread *pmd; >>> - >>> - CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>> - if (pmd->need_reload) { >>> - dp_netdev_reload_pmd__(pmd); >>> - pmd->need_reload = false; >>> - } >>> - } >>> -} >>> - >>> -static void >>> pmd_remove_stale_ports(struct dp_netdev *dp, >>> struct dp_netdev_pmd_thread *pmd) >>> OVS_EXCLUDED(pmd->port_mutex) >>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread >>> *pmd) >>> } >>> } >>> >>> +static void >>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) >>> +{ >>> + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); >>> + if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { >>> + VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" >>> + ", numa_id %d.", pmd->core_id, pmd->numa_id); >>> + } >>> + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); >>> + >>> + VLOG_DBG("static_tx_qid = %d allocated for PMD 
thread on core %2d" >>> + ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, >>> pmd->numa_id); >>> +} >>> + >>> +static void >>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) >>> +{ >>> + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); >>> + id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); >>> + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); >>> +} >>> + >>> static int >>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd, >>> struct polled_queue **ppoll_list) >>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_) >>> dpdk_set_lcore_id(pmd->core_id); >>> poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); >>> reload: >>> + pmd_alloc_static_tx_qid(pmd); >>> emc_cache_init(&pmd->flow_cache); >>> >>> /* List port/core affinity */ >>> @@ -3766,6 +3830,7 @@ reload: >>> dp_netdev_pmd_reload_done(pmd); >>> >>> emc_cache_uninit(&pmd->flow_cache); >>> + pmd_free_static_tx_qid(pmd); >>> >>> if (!exiting) { >>> goto reload; >>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >>> *pmd, struct >>> dp_netdev *dp, >>> pmd->numa_id = numa_id; >>> pmd->need_reload = false; >>> >>> - *CONST_CAST(int *, &pmd->static_tx_qid) = >>> cmap_count(&dp->poll_threads); >>> - >>> ovs_refcount_init(&pmd->ref_cnt); >>> latch_init(&pmd->exit_latch); >>> pmd->reload_seq = seq_create(); >>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >>> *pmd, struct >>> dp_netdev *dp, >>> * actual thread created for NON_PMD_CORE_ID. 
*/ >>> if (core_id == NON_PMD_CORE_ID) { >>> emc_cache_init(&pmd->flow_cache); >>> + pmd_alloc_static_tx_qid(pmd); >>> } >>> cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, >>> &pmd->node), >>> hash_int(core_id, 0)); >>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct >>> dp_netdev_pmd_thread >>> *pmd) >>> ovs_mutex_lock(&dp->non_pmd_mutex); >>> emc_cache_uninit(&pmd->flow_cache); >>> pmd_free_cached_ports(pmd); >>> + pmd_free_static_tx_qid(pmd); >>> ovs_mutex_unlock(&dp->non_pmd_mutex); >>> } else { >>> latch_set(&pmd->exit_latch); >>> diff --git a/tests/pmd.at b/tests/pmd.at >>> index 05755b3..39a3645 100644 >>> --- a/tests/pmd.at >>> +++ b/tests/pmd.at >>> @@ -38,7 +38,7 @@ dnl >>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not >>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log . >>> m4_define([CHECK_PMD_THREADS_CREATED], [ >>> - PATTERN="Created [[0-9]]* pmd threads on numa node $2" >>> + PATTERN="There are [[0-9]]* pmd threads on numa node $2" >>> line_st=$3 >>> if [[ -z "$line_st" ]] >>> then >>> -- >>> 2.7.4 >>> >>> _______________________________________________ >>> dev mailing list >>> d...@openvswitch.org >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> >> >> _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev