>From: Ilya Maximets [mailto:i.maxim...@samsung.com] >Sent: Thursday, June 15, 2017 11:07 AM >To: Kavanagh, Mark B <mark.b.kavan...@intel.com>; d...@openvswitch.org; >Darrell Ball ><db...@vmware.com> >Cc: Heetae Ahn <heetae82....@samsung.com>; Daniele Di Proietto ><diproiet...@ovn.org>; Ben >Pfaff <b...@ovn.org>; Pravin Shelar <pshe...@ovn.org>; Loftus, Ciara ><ciara.lof...@intel.com>; >Stokes, Ian <ian.sto...@intel.com>; Kevin Traynor <ktray...@redhat.com> >Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >addition/deletion of PMD >threads. > >On 06.06.2017 13:02, Kavanagh, Mark B wrote: >> >> >>> From: Ilya Maximets [mailto:i.maxim...@samsung.com] >>> Sent: Monday, June 5, 2017 9:18 AM >>> To: Kavanagh, Mark B <mark.b.kavan...@intel.com>; d...@openvswitch.org; >>> Darrell Ball >>> <db...@vmware.com> >>> Cc: Heetae Ahn <heetae82....@samsung.com>; Daniele Di Proietto >>> <diproiet...@ovn.org>; Ben >>> Pfaff <b...@ovn.org>; Pravin Shelar <pshe...@ovn.org>; Loftus, Ciara ><ciara.lof...@intel.com>; >>> Stokes, Ian <ian.sto...@intel.com>; Kevin Traynor <ktray...@redhat.com> >>> Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >>> addition/deletion of PMD >>> threads. >>> >>> Thanks for testing, Mark. >>> >>> Comments inline. >>> >>> Best regards, Ilya Maximets. >>> >>> On 01.06.2017 17:30, Kavanagh, Mark B wrote: >>>>> From: ovs-dev-boun...@openvswitch.org >>>>> [mailto:ovs-dev-boun...@openvswitch.org] On Behalf >>> Of >>>>> Ilya Maximets >>>>> Sent: Tuesday, May 30, 2017 3:12 PM >>>>> To: d...@openvswitch.org; Darrell Ball <db...@vmware.com> >>>>> Cc: Heetae Ahn <heetae82....@samsung.com>; Ilya Maximets >>>>> <i.maxim...@samsung.com> >>>>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >>>>> addition/deletion of PMD >>> threads. >>>>> >>>>> Currently, change of 'pmd-cpu-mask' is very heavy operation. >>>>> It requires destroying of all the PMD threads and creating >>>>> them back. After that, all the threads will sleep until >>>>> ports' redistribution finished. >>>>> >>>>> This patch adds ability to not stop the datapath while >>>>> adjusting number/placement of PMD threads. All not affected >>>>> threads will forward traffic without any additional latencies. >>>>> >>>>> id-pool created for static tx queue ids to keep them sequential >>>>> in a flexible way. non-PMD thread will always have >>>>> static_tx_qid = 0 as it was before. >>>>> >>>>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com> >>>> >>>> Hi Ilya, >>>> >>>> Patch LGTM - just once comment inline. >>>> >>>> I conducted some light testing and observed the following: >>>> - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask >>>> (in previous >>> implementation, forwarding stopped completely) >>>> - occasional drop to zero in forwarding rate when modifying >>>> pmd-cpu-mask (e.g. >>> change from '2' to '12') >>> >>> I guess, drop to zero in this case could be fixed by second >>> patch of the series. (If it caused by port reconfiguration) >> >> Yes, that's true - my colleague, Ian Stokes, will review and test that patch >> shortly. >> >>> >>>> - temporary drop of ~1mpps when adding/deleting port (similar to >>>> existing >>> implementation) >>>> >>>> I've also conducted the following sanity checks on the patch, and found no >>>> issues: >>>> - checkpatch.py >>>> - compilation with clang >>>> - compilation with gcc >>>> - sparse >>>> - make check (no additional tests fail after application of this patch) >>>> >>>> Tested-by: Mark Kavanagh <mark.b.kavan...@intel.com> >>>> >>>> Thanks, >>>> Mark >>>> >>>>> --- >>>>> lib/dpif-netdev.c | 143 >>>>> +++++++++++++++++++++++++++++++++++++++--------------- >>>>> tests/pmd.at | 2 +- >>>>> 2 files changed, 105 insertions(+), 40 deletions(-) >>>>> >>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >>>>> index b50164b..596d133 100644 >>>>> --- a/lib/dpif-netdev.c >>>>> +++ b/lib/dpif-netdev.c >>>>> @@ -48,6 +48,7 @@ >>>>> #include "fat-rwlock.h" >>>>> #include "flow.h" >>>>> #include "hmapx.h" >>>>> +#include "id-pool.h" >>>>> #include "latch.h" >>>>> #include "netdev.h" >>>>> #include "netdev-vport.h" >>>>> @@ -277,6 +278,9 @@ struct dp_netdev { >>>>> >>>>> /* Stores all 'struct dp_netdev_pmd_thread's. */ >>>>> struct cmap poll_threads; >>>>> + /* id pool for per thread static_tx_qid. */ >>>>> + struct id_pool *tx_qid_pool; >>>>> + struct ovs_mutex tx_qid_pool_mutex; >>>>> >>>>> /* Protects the access of the 'struct dp_netdev_pmd_thread' >>>>> * instance for non-pmd thread. */ >>>>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread { >>>>> /* Queue id used by this pmd thread to send packets on all netdevs if >>>>> * XPS disabled for this netdev. All static_tx_qid's are unique and >>>>> less >>>>> * than 'cmap_count(dp->poll_threads)'. */ >>>>> - const int static_tx_qid; >>>>> + uint32_t static_tx_qid; >>>>> >>>>> struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and >>>>> 'tx_ports'. */ >>>>> /* List of rx queues to poll. */ >>>>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread >>>>> *dp_netdev_get_pmd(struct >dp_netdev >>>>> *dp, >>>>> unsigned core_id); >>>>> static struct dp_netdev_pmd_thread * >>>>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); >>>> +static void dp_netdev_del_pmd(struct dp_netdev *dp, >>>>> + struct dp_netdev_pmd_thread *pmd); >>>>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool >>>>> non_pmd); >>>>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd); >>>>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, >>>>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct >>>>> dpif_class >*class, >>>>> atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); >>>>> >>>>> cmap_init(&dp->poll_threads); >>>>> + >>>>> + ovs_mutex_init(&dp->tx_qid_pool_mutex); >>>>> + /* We need 1 Tx queue for each possible core + 1 for non-PMD >>>>> threads. */ >>>>> + dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1); >>>>> + >>>>> ovs_mutex_init_recursive(&dp->non_pmd_mutex); >>>>> ovsthread_key_create(&dp->per_pmd_key, NULL); >>>>> >>>>> ovs_mutex_lock(&dp->port_mutex); >>>>> + /* non-PMD will be created before all other threads and will >>>>> + * allocate static_tx_qid = 0. */ >>>>> dp_netdev_set_nonpmd(dp); >>>>> >>>>> error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class, >>>>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp) >>>>> dp_netdev_destroy_all_pmds(dp, true); >>>>> cmap_destroy(&dp->poll_threads); >>>>> >>>>> + ovs_mutex_destroy(&dp->tx_qid_pool_mutex); >>>>> + id_pool_destroy(dp->tx_qid_pool); >>>>> + >>>>> ovs_mutex_destroy(&dp->non_pmd_mutex); >>>>> ovsthread_key_delete(dp->per_pmd_key); >>>>> >>>>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) >OVS_REQUIRES(dp- >>>>>> port_mutex) >>>>> } >>>>> >>>>> static void >>>>> +reload_affected_pmds(struct dp_netdev *dp) >>>>> +{ >>>>> + struct dp_netdev_pmd_thread *pmd; >>>>> + >>>>> + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>>>> + if (pmd->need_reload) { >>>>> + dp_netdev_reload_pmd__(pmd); >>>>> + pmd->need_reload = false; >>>>> + } >>>>> + } >>>>> +} >>>>> + >>>>> +static void >>>>> reconfigure_pmd_threads(struct dp_netdev *dp) >>>>> OVS_REQUIRES(dp->port_mutex) >>>>> { >>>>> struct dp_netdev_pmd_thread *pmd; >>>>> struct ovs_numa_dump *pmd_cores; >>>>> + struct ovs_numa_info_core *core; >>>>> + struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); >>>>> + struct hmapx_node *node; >>>>> bool changed = false; >>>>> + bool need_to_adjust_static_tx_qids = false; >>>>> >>>>> /* The pmd threads should be started only if there's a pmd port in the >>>>> * datapath. If the user didn't provide any "pmd-cpu-mask", we start >>>>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp) >>>>> pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); >>>>> } >>>>> >>>>> - /* Check for changed configuration */ >>>>> - if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) >>>>> - 1) { >>>>> - changed = true; >>>>> - } else { >>>>> - CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>>>> - if (pmd->core_id != NON_PMD_CORE_ID >>>>> - && !ovs_numa_dump_contains_core(pmd_cores, >>>>> - pmd->numa_id, >>>>> - pmd->core_id)) { >>>>> - changed = true; >>>>> - break; >>>>> - } >>>>> + /* We need to adjust 'static_tx_qid's only if we're reducing number >>>>> of >>>>> + * PMD threads. Otherwise, new threads will allocate all the freed >>>>> ids. */ >>>> >>>> Can you elaborate on why the static_tx_qids need to be adjusted - it's >>>> still not clear to >>> me from the comment. >>> >>> I think, currently, the main case for keeping tx queue ids sequential >>> is the vhost-user ports with big number of queues but disabled by >>> driver in guest. >>> >>> If vhost-user device has enough number of queues to not use dynamic >>> distribution (XPS) and some of queues will be disabled inside the >>> guest, we will face almost same issue as described in >>> 347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.") >>> >>> For example: >>> >>> We may have 4 threads and vhost-user port with 4 queues. >>> Lets assume that static tx qids distributed as follows: >>> thread #0 --> queue #0 >>> thread #1 --> queue #1 >>> thread #2 --> queue #3 >>> thread #3 --> queue #2 >>> Now guest disables queue #3 (ethtool -L eth0 combined 3) >>> And we're manually changing pmd-cpu-mask to remove thread #3. >>> (You need to be sure that threads wasn't reloaded here.) >>> At this point, if tx queue ids wasn't adjusted, we will have: >>> thread #0 --> queue #0 --> queue #0 >>> thread #1 --> queue #1 --> queue #1 >>> thread #2 --> queue #3 --> queue #0 >>> Where the last column is the result of remapping inside >>> __netdev_dpdk_vhost_send(). >>> So, in this case, there will be 3 queues available, but >>> only 2 used by 3 threads. --> unused queue #2 and locking >>> on access to queue #0. >> >> Okay, I got it - thanks for the detailed explanation Ilya. >> >> With that, Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com> > >Hi Mark. Do you think I need to add some comment about txq adjustment >in the code? I'm going to respin the series because of compiler >warning in the second patch found by Ian and want to know your >opinion. > >Also, I'm going to keep your Tested-by tag because of no code changes >in this patch planned. I'll add your Acked-by in case no additional >comment needed or if below example looks good to you. > >I can add something like this: > > /* We need to adjust 'static_tx_qid's only if we're reducing number of > * PMD threads. Otherwise, new threads will allocate all the freed ids. */ > if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) { > /* Adjustment is required to keep 'static_tx_qid's sequential and > * avoid possible issues, for example, imbalanced tx queue usage > * and unnecessary locking caused by remapping on netdev level. */ > need_to_adjust_static_tx_qids = true; > } > >What do you think?
Hey Ilya, Sounds good to me - feel free to add my [acked|tested]-by tags on the V3 patch. Thanks! Mark > >>> >>>>> + if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - >>>>> 1) { >>>>> + need_to_adjust_static_tx_qids = true; >>>>> + } >>>>> + >>>>> + /* Check for unwanted pmd threads */ >>>>> + CMAP_FOR_EACH(pmd, node, &dp->poll_threads) { >>>>> + if (pmd->core_id == NON_PMD_CORE_ID) { >>>>> + continue; >>>>> + } >>>>> + if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, >>>>> + pmd->core_id)) { >>>>> + hmapx_add(&to_delete, pmd); >>>>> + } else if (need_to_adjust_static_tx_qids) { >>>>> + pmd->need_reload = true; >>>>> } >>>>> } >>>>> >>>>> - /* Destroy the old and recreate the new pmd threads. We don't >>>>> perform an >>>>> - * incremental update because we would have to adjust >>>>> 'static_tx_qid'. */ >>>>> - if (changed) { >>>>> - struct ovs_numa_info_core *core; >>>>> - struct ovs_numa_info_numa *numa; >>>>> + HMAPX_FOR_EACH (node, &to_delete) { >>>>> + pmd = (struct dp_netdev_pmd_thread *) node->data; >>>>> + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", >>>>> + pmd->numa_id, pmd->core_id); >>>>> + dp_netdev_del_pmd(dp, pmd); >>>>> + } >>>>> + changed = !hmapx_is_empty(&to_delete); >>>>> + hmapx_destroy(&to_delete); >>>>> >>>>> - /* Do not destroy the non pmd thread. */ >>>>> - dp_netdev_destroy_all_pmds(dp, false); >>>>> - FOR_EACH_CORE_ON_DUMP (core, pmd_cores) { >>>>> - struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd); >>>>> + if (need_to_adjust_static_tx_qids) { >>>>> + /* 'static_tx_qid's are not sequential now. >>>>> + * Reload remaining threads to fix this. */ >>>>> + reload_affected_pmds(dp); >>>>> + } >>>>> >>>>> + /* Check for required new pmd threads */ >>>>> + FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { >>>>> + pmd = dp_netdev_get_pmd(dp, core->core_id); >>>>> + if (!pmd) { >>>>> + pmd = xzalloc(sizeof *pmd); >>>>> dp_netdev_configure_pmd(pmd, dp, core->core_id, >>>>> core->numa_id); >>>>> - >>>>> pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd); >>>>> + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", >>>>> + pmd->numa_id, pmd->core_id); >>>>> + changed = true; >>>>> + } else { >>>>> + dp_netdev_pmd_unref(pmd); >>>>> } >>>>> + } >>>>> + >>>>> + if (changed) { >>>>> + struct ovs_numa_info_numa *numa; >>>>> >>>>> /* Log the number of pmd threads per numa node. */ >>>>> FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { >>>>> - VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d", >>>>> + VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node >>>>> %d", >>>>> numa->n_cores, numa->numa_id); >>>>> } >>>>> } >>>>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp) >>>>> } >>>>> >>>>> static void >>>>> -reload_affected_pmds(struct dp_netdev *dp) >>>>> -{ >>>>> - struct dp_netdev_pmd_thread *pmd; >>>>> - >>>>> - CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>>>> - if (pmd->need_reload) { >>>>> - dp_netdev_reload_pmd__(pmd); >>>>> - pmd->need_reload = false; >>>>> - } >>>>> - } >>>>> -} >>>>> - >>>>> -static void >>>>> pmd_remove_stale_ports(struct dp_netdev *dp, >>>>> struct dp_netdev_pmd_thread *pmd) >>>>> OVS_EXCLUDED(pmd->port_mutex) >>>>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread >>>>> *pmd) >>>>> } >>>>> } >>>>> >>>>> +static void >>>>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) >>>>> +{ >>>>> + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); >>>>> + if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { >>>>> + VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" >>>>> + ", numa_id %d.", pmd->core_id, pmd->numa_id); >>>>> + } >>>>> + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); >>>>> + >>>>> + VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d" >>>>> + ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, >>>>> pmd->numa_id); >>>>> +} >>>>> + >>>>> +static void >>>>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) >>>>> +{ >>>>> + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); >>>>> + id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); >>>>> + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); >>>>> +} >>>>> + >>>>> static int >>>>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd, >>>>> struct polled_queue **ppoll_list) >>>>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_) >>>>> dpdk_set_lcore_id(pmd->core_id); >>>>> poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); >>>>> reload: >>>>> + pmd_alloc_static_tx_qid(pmd); >>>>> emc_cache_init(&pmd->flow_cache); >>>>> >>>>> /* List port/core affinity */ >>>>> @@ -3766,6 +3830,7 @@ reload: >>>>> dp_netdev_pmd_reload_done(pmd); >>>>> >>>>> emc_cache_uninit(&pmd->flow_cache); >>>>> + pmd_free_static_tx_qid(pmd); >>>>> >>>>> if (!exiting) { >>>>> goto reload; >>>>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >>>>> *pmd, struct >>>>> dp_netdev *dp, >>>>> pmd->numa_id = numa_id; >>>>> pmd->need_reload = false; >>>>> >>>>> - *CONST_CAST(int *, &pmd->static_tx_qid) = >>>>> cmap_count(&dp->poll_threads); >>>>> - >>>>> ovs_refcount_init(&pmd->ref_cnt); >>>>> latch_init(&pmd->exit_latch); >>>>> pmd->reload_seq = seq_create(); >>>>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >>>>> *pmd, struct >>>>> dp_netdev *dp, >>>>> * actual thread created for NON_PMD_CORE_ID. */ >>>>> if (core_id == NON_PMD_CORE_ID) { >>>>> emc_cache_init(&pmd->flow_cache); >>>>> + pmd_alloc_static_tx_qid(pmd); >>>>> } >>>>> cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, >>>>> &pmd->node), >>>>> hash_int(core_id, 0)); >>>>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct >dp_netdev_pmd_thread >>>>> *pmd) >>>>> ovs_mutex_lock(&dp->non_pmd_mutex); >>>>> emc_cache_uninit(&pmd->flow_cache); >>>>> pmd_free_cached_ports(pmd); >>>>> + pmd_free_static_tx_qid(pmd); >>>>> ovs_mutex_unlock(&dp->non_pmd_mutex); >>>>> } else { >>>>> latch_set(&pmd->exit_latch); >>>>> diff --git a/tests/pmd.at b/tests/pmd.at >>>>> index 05755b3..39a3645 100644 >>>>> --- a/tests/pmd.at >>>>> +++ b/tests/pmd.at >>>>> @@ -38,7 +38,7 @@ dnl >>>>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not >>>>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log . >>>>> m4_define([CHECK_PMD_THREADS_CREATED], [ >>>>> - PATTERN="Created [[0-9]]* pmd threads on numa node $2" >>>>> + PATTERN="There are [[0-9]]* pmd threads on numa node $2" >>>>> line_st=$3 >>>>> if [[ -z "$line_st" ]] >>>>> then >>>>> -- >>>>> 2.7.4 >>>>> >>>>> _______________________________________________ >>>>> dev mailing list >>>>> d...@openvswitch.org >>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >>>> >>>> >>>> >> >> >> _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev