>From: Ilya Maximets [mailto:i.maxim...@samsung.com] >Sent: Monday, June 5, 2017 9:18 AM >To: Kavanagh, Mark B <mark.b.kavan...@intel.com>; d...@openvswitch.org; >Darrell Ball ><db...@vmware.com> >Cc: Heetae Ahn <heetae82....@samsung.com>; Daniele Di Proietto ><diproiet...@ovn.org>; Ben >Pfaff <b...@ovn.org>; Pravin Shelar <pshe...@ovn.org>; Loftus, Ciara ><ciara.lof...@intel.com>; >Stokes, Ian <ian.sto...@intel.com>; Kevin Traynor <ktray...@redhat.com> >Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >addition/deletion of PMD >threads. > >Thanks for testing, Mark. > >Comments inline. > >Best regards, Ilya Maximets. > >On 01.06.2017 17:30, Kavanagh, Mark B wrote: >>> From: ovs-dev-boun...@openvswitch.org >>> [mailto:ovs-dev-boun...@openvswitch.org] On Behalf >Of >>> Ilya Maximets >>> Sent: Tuesday, May 30, 2017 3:12 PM >>> To: d...@openvswitch.org; Darrell Ball <db...@vmware.com> >>> Cc: Heetae Ahn <heetae82....@samsung.com>; Ilya Maximets >>> <i.maxim...@samsung.com> >>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental >>> addition/deletion of PMD >threads. >>> >>> Currently, change of 'pmd-cpu-mask' is very heavy operation. >>> It requires destroying of all the PMD threads and creating >>> them back. After that, all the threads will sleep until >>> ports' redistribution finished. >>> >>> This patch adds ability to not stop the datapath while >>> adjusting number/placement of PMD threads. All not affected >>> threads will forward traffic without any additional latencies. >>> >>> id-pool created for static tx queue ids to keep them sequential >>> in a flexible way. non-PMD thread will always have >>> static_tx_qid = 0 as it was before. >>> >>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com> >> >> Hi Ilya, >> >> Patch LGTM - just once comment inline. 
>> >> I conducted some light testing and observed the following: >> - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask >> (in previous >implementation, forwarding stopped completely) >> - occasional drop to zero in forwarding rate when modifying >> pmd-cpu-mask (e.g. >change from '2' to '12') > >I guess the drop to zero in this case could be fixed by the second >patch of the series. (If it is caused by port reconfiguration.)
Yes, that's true - my colleague, Ian Stokes, will review and test that patch shortly. > >> - temporary drop of ~1mpps when adding/deleting port (similar to existing >implementation) >> >> I've also conducted the following sanity checks on the patch, and found no >> issues: >> - checkpatch.py >> - compilation with clang >> - compilation with gcc >> - sparse >> - make check (no additional tests fail after application of this patch) >> >> Tested-by: Mark Kavanagh <mark.b.kavan...@intel.com> >> >> Thanks, >> Mark >> >>> --- >>> lib/dpif-netdev.c | 143 >>> +++++++++++++++++++++++++++++++++++++++--------------- >>> tests/pmd.at | 2 +- >>> 2 files changed, 105 insertions(+), 40 deletions(-) >>> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >>> index b50164b..596d133 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -48,6 +48,7 @@ >>> #include "fat-rwlock.h" >>> #include "flow.h" >>> #include "hmapx.h" >>> +#include "id-pool.h" >>> #include "latch.h" >>> #include "netdev.h" >>> #include "netdev-vport.h" >>> @@ -277,6 +278,9 @@ struct dp_netdev { >>> >>> /* Stores all 'struct dp_netdev_pmd_thread's. */ >>> struct cmap poll_threads; >>> + /* id pool for per thread static_tx_qid. */ >>> + struct id_pool *tx_qid_pool; >>> + struct ovs_mutex tx_qid_pool_mutex; >>> >>> /* Protects the access of the 'struct dp_netdev_pmd_thread' >>> * instance for non-pmd thread. */ >>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread { >>> /* Queue id used by this pmd thread to send packets on all netdevs if >>> * XPS disabled for this netdev. All static_tx_qid's are unique and less >>> * than 'cmap_count(dp->poll_threads)'. */ >>> - const int static_tx_qid; >>> + uint32_t static_tx_qid; >>> >>> struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and >>> 'tx_ports'. */ >>> /* List of rx queues to poll. 
*/ >>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread >>> *dp_netdev_get_pmd(struct dp_netdev >>> *dp, >>> unsigned core_id); >>> static struct dp_netdev_pmd_thread * >>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos); >>> +static void dp_netdev_del_pmd(struct dp_netdev *dp, >>> + struct dp_netdev_pmd_thread *pmd); >>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd); >>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd); >>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, >>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct >>> dpif_class *class, >>> atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN); >>> >>> cmap_init(&dp->poll_threads); >>> + >>> + ovs_mutex_init(&dp->tx_qid_pool_mutex); >>> + /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. >>> */ >>> + dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1); >>> + >>> ovs_mutex_init_recursive(&dp->non_pmd_mutex); >>> ovsthread_key_create(&dp->per_pmd_key, NULL); >>> >>> ovs_mutex_lock(&dp->port_mutex); >>> + /* non-PMD will be created before all other threads and will >>> + * allocate static_tx_qid = 0. 
*/ >>> dp_netdev_set_nonpmd(dp); >>> >>> error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class, >>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp) >>> dp_netdev_destroy_all_pmds(dp, true); >>> cmap_destroy(&dp->poll_threads); >>> >>> + ovs_mutex_destroy(&dp->tx_qid_pool_mutex); >>> + id_pool_destroy(dp->tx_qid_pool); >>> + >>> ovs_mutex_destroy(&dp->non_pmd_mutex); >>> ovsthread_key_delete(dp->per_pmd_key); >>> >>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) >>> OVS_REQUIRES(dp- >>>> port_mutex) >>> } >>> >>> static void >>> +reload_affected_pmds(struct dp_netdev *dp) >>> +{ >>> + struct dp_netdev_pmd_thread *pmd; >>> + >>> + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>> + if (pmd->need_reload) { >>> + dp_netdev_reload_pmd__(pmd); >>> + pmd->need_reload = false; >>> + } >>> + } >>> +} >>> + >>> +static void >>> reconfigure_pmd_threads(struct dp_netdev *dp) >>> OVS_REQUIRES(dp->port_mutex) >>> { >>> struct dp_netdev_pmd_thread *pmd; >>> struct ovs_numa_dump *pmd_cores; >>> + struct ovs_numa_info_core *core; >>> + struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete); >>> + struct hmapx_node *node; >>> bool changed = false; >>> + bool need_to_adjust_static_tx_qids = false; >>> >>> /* The pmd threads should be started only if there's a pmd port in the >>> * datapath. 
If the user didn't provide any "pmd-cpu-mask", we start >>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp) >>> pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS); >>> } >>> >>> - /* Check for changed configuration */ >>> - if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - >>> 1) { >>> - changed = true; >>> - } else { >>> - CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>> - if (pmd->core_id != NON_PMD_CORE_ID >>> - && !ovs_numa_dump_contains_core(pmd_cores, >>> - pmd->numa_id, >>> - pmd->core_id)) { >>> - changed = true; >>> - break; >>> - } >>> + /* We need to adjust 'static_tx_qid's only if we're reducing number of >>> + * PMD threads. Otherwise, new threads will allocate all the freed >>> ids. */ >> >> Can you elaborate on why the static_tx_qids need to be adjusted - it's still >> not clear to >me from the comment. > >I think, currently, the main case for keeping tx queue ids sequential >is the vhost-user ports with big number of queues but disabled by >driver in guest. > >If vhost-user device has enough number of queues to not use dynamic >distribution (XPS) and some of queues will be disabled inside the >guest, we will face almost same issue as described in >347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.") > >For example: > > We may have 4 threads and vhost-user port with 4 queues. > Lets assume that static tx qids distributed as follows: > thread #0 --> queue #0 > thread #1 --> queue #1 > thread #2 --> queue #3 > thread #3 --> queue #2 > Now guest disables queue #3 (ethtool -L eth0 combined 3) > And we're manually changing pmd-cpu-mask to remove thread #3. > (You need to be sure that threads wasn't reloaded here.) > At this point, if tx queue ids wasn't adjusted, we will have: > thread #0 --> queue #0 --> queue #0 > thread #1 --> queue #1 --> queue #1 > thread #2 --> queue #3 --> queue #0 > Where the last column is the result of remapping inside > __netdev_dpdk_vhost_send(). 
> So, in this case, there will be 3 queues available, but > only 2 used by 3 threads. --> unused queue #2 and locking > on access to queue #0. Okay, I got it - thanks for the detailed explanation Ilya. With that, Acked-by: Mark Kavanagh <mark.b.kavan...@intel.com> > >>> + if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - >>> 1) { >>> + need_to_adjust_static_tx_qids = true; >>> + } >>> + >>> + /* Check for unwanted pmd threads */ >>> + CMAP_FOR_EACH(pmd, node, &dp->poll_threads) { >>> + if (pmd->core_id == NON_PMD_CORE_ID) { >>> + continue; >>> + } >>> + if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id, >>> + pmd->core_id)) { >>> + hmapx_add(&to_delete, pmd); >>> + } else if (need_to_adjust_static_tx_qids) { >>> + pmd->need_reload = true; >>> } >>> } >>> >>> - /* Destroy the old and recreate the new pmd threads. We don't perform >>> an >>> - * incremental update because we would have to adjust 'static_tx_qid'. >>> */ >>> - if (changed) { >>> - struct ovs_numa_info_core *core; >>> - struct ovs_numa_info_numa *numa; >>> + HMAPX_FOR_EACH (node, &to_delete) { >>> + pmd = (struct dp_netdev_pmd_thread *) node->data; >>> + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.", >>> + pmd->numa_id, pmd->core_id); >>> + dp_netdev_del_pmd(dp, pmd); >>> + } >>> + changed = !hmapx_is_empty(&to_delete); >>> + hmapx_destroy(&to_delete); >>> >>> - /* Do not destroy the non pmd thread. */ >>> - dp_netdev_destroy_all_pmds(dp, false); >>> - FOR_EACH_CORE_ON_DUMP (core, pmd_cores) { >>> - struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd); >>> + if (need_to_adjust_static_tx_qids) { >>> + /* 'static_tx_qid's are not sequential now. >>> + * Reload remaining threads to fix this. 
*/ >>> + reload_affected_pmds(dp); >>> + } >>> >>> + /* Check for required new pmd threads */ >>> + FOR_EACH_CORE_ON_DUMP(core, pmd_cores) { >>> + pmd = dp_netdev_get_pmd(dp, core->core_id); >>> + if (!pmd) { >>> + pmd = xzalloc(sizeof *pmd); >>> dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id); >>> - >>> pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd); >>> + VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.", >>> + pmd->numa_id, pmd->core_id); >>> + changed = true; >>> + } else { >>> + dp_netdev_pmd_unref(pmd); >>> } >>> + } >>> + >>> + if (changed) { >>> + struct ovs_numa_info_numa *numa; >>> >>> /* Log the number of pmd threads per numa node. */ >>> FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) { >>> - VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d", >>> + VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d", >>> numa->n_cores, numa->numa_id); >>> } >>> } >>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp) >>> } >>> >>> static void >>> -reload_affected_pmds(struct dp_netdev *dp) >>> -{ >>> - struct dp_netdev_pmd_thread *pmd; >>> - >>> - CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { >>> - if (pmd->need_reload) { >>> - dp_netdev_reload_pmd__(pmd); >>> - pmd->need_reload = false; >>> - } >>> - } >>> -} >>> - >>> -static void >>> pmd_remove_stale_ports(struct dp_netdev *dp, >>> struct dp_netdev_pmd_thread *pmd) >>> OVS_EXCLUDED(pmd->port_mutex) >>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread >>> *pmd) >>> } >>> } >>> >>> +static void >>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) >>> +{ >>> + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); >>> + if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) { >>> + VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d" >>> + ", numa_id %d.", pmd->core_id, pmd->numa_id); >>> + } >>> + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); >>> + >>> + VLOG_DBG("static_tx_qid = %d allocated for PMD 
thread on core %2d" >>> + ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, >>> pmd->numa_id); >>> +} >>> + >>> +static void >>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) >>> +{ >>> + ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex); >>> + id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid); >>> + ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex); >>> +} >>> + >>> static int >>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd, >>> struct polled_queue **ppoll_list) >>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_) >>> dpdk_set_lcore_id(pmd->core_id); >>> poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); >>> reload: >>> + pmd_alloc_static_tx_qid(pmd); >>> emc_cache_init(&pmd->flow_cache); >>> >>> /* List port/core affinity */ >>> @@ -3766,6 +3830,7 @@ reload: >>> dp_netdev_pmd_reload_done(pmd); >>> >>> emc_cache_uninit(&pmd->flow_cache); >>> + pmd_free_static_tx_qid(pmd); >>> >>> if (!exiting) { >>> goto reload; >>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >>> *pmd, struct >>> dp_netdev *dp, >>> pmd->numa_id = numa_id; >>> pmd->need_reload = false; >>> >>> - *CONST_CAST(int *, &pmd->static_tx_qid) = >>> cmap_count(&dp->poll_threads); >>> - >>> ovs_refcount_init(&pmd->ref_cnt); >>> latch_init(&pmd->exit_latch); >>> pmd->reload_seq = seq_create(); >>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread >>> *pmd, struct >>> dp_netdev *dp, >>> * actual thread created for NON_PMD_CORE_ID. 
*/ >>> if (core_id == NON_PMD_CORE_ID) { >>> emc_cache_init(&pmd->flow_cache); >>> + pmd_alloc_static_tx_qid(pmd); >>> } >>> cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, >>> &pmd->node), >>> hash_int(core_id, 0)); >>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct >>> dp_netdev_pmd_thread >>> *pmd) >>> ovs_mutex_lock(&dp->non_pmd_mutex); >>> emc_cache_uninit(&pmd->flow_cache); >>> pmd_free_cached_ports(pmd); >>> + pmd_free_static_tx_qid(pmd); >>> ovs_mutex_unlock(&dp->non_pmd_mutex); >>> } else { >>> latch_set(&pmd->exit_latch); >>> diff --git a/tests/pmd.at b/tests/pmd.at >>> index 05755b3..39a3645 100644 >>> --- a/tests/pmd.at >>> +++ b/tests/pmd.at >>> @@ -38,7 +38,7 @@ dnl >>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not >>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log . >>> m4_define([CHECK_PMD_THREADS_CREATED], [ >>> - PATTERN="Created [[0-9]]* pmd threads on numa node $2" >>> + PATTERN="There are [[0-9]]* pmd threads on numa node $2" >>> line_st=$3 >>> if [[ -z "$line_st" ]] >>> then >>> -- >>> 2.7.4 >>> >>> _______________________________________________ >>> dev mailing list >>> d...@openvswitch.org >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> >> >> _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev