On Fri, Mar 28, 2025 at 06:00:44AM -0500, Tirthendu Sarkar wrote:
> Streamline code for AVX512 and SSE by consolidating the common code and
> adding runtime check for selecting appropriate path based on CPU
> capability.
> 
> Signed-off-by: Tirthendu Sarkar <tirthendu.sar...@intel.com>
> ---
> v2:
>  - Addressed review comments [Bruce Richardson]

Tested that we can still get the function pointer set to the AVX-512 path
in a generic build.

Acked-by: Bruce Richardson <bruce.richard...@intel.com>

Some additional feedback inline below. Probably want to do a v3 to fix some
of them.


> 
>  drivers/event/dlb2/dlb2.c        | 199 ++++++++++++++++++++-
>  drivers/event/dlb2/dlb2_avx512.c | 298 ++++---------------------------
>  drivers/event/dlb2/dlb2_priv.h   |   9 +-
>  drivers/event/dlb2/dlb2_sse.c    | 210 +---------------------
>  4 files changed, 241 insertions(+), 475 deletions(-)
> 
> diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
> index 934fcafcfe..4c0b4686a4 100644
> --- a/drivers/event/dlb2/dlb2.c
> +++ b/drivers/event/dlb2/dlb2.c
> @@ -90,6 +90,9 @@ static struct rte_event_dev_info evdev_dlb2_default_info = {
>  struct process_local_port_data
>  dlb2_port[DLB2_MAX_NUM_PORTS_ALL][DLB2_NUM_PORT_TYPES];
>  
> +static void
> +(*dlb2_build_qes)(struct dlb2_enqueue_qe *qe, const struct rte_event ev[], 
> __m128i sse_qe[]);
> +
>  static void
>  dlb2_free_qe_mem(struct dlb2_port *qm_port)
>  {
> @@ -2069,9 +2072,9 @@ dlb2_eventdev_port_setup(struct rte_eventdev *dev,
>  
>       if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512VL) &&
>           rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
> -             ev_port->qm_port.use_avx512 = true;
> +             dlb2_build_qes = dlb2_build_qes_avx512;
>       else
> -             ev_port->qm_port.use_avx512 = false;
> +             dlb2_build_qes = dlb2_build_qes_sse;
>  
>       return 0;
>  }
> @@ -2669,6 +2672,21 @@ dlb2_eventdev_start(struct rte_eventdev *dev)
>       return 0;
>  }
>  
> +static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
> +     {
> +             /* Load-balanced cmd bytes */
> +             [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> +             [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
> +             [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
> +     },
> +     {
> +             /* Directed cmd bytes */
> +             [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> +             [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
> +             [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
> +     },
> +};

Minor nit, but this seems in a strange position in the file, being a
global. As far as I can see, it's only used by the one function -
dlb2_event_build_hcws() - so maybe make it a static local variable there.

> +
>  static inline uint32_t
>  dlb2_port_credits_get(struct dlb2_port *qm_port,
>                     enum dlb2_hw_queue_types type)
> @@ -2887,6 +2905,183 @@ dlb2_construct_token_pop_qe(struct dlb2_port 
> *qm_port, int idx)
>       qm_port->owed_tokens = 0;
>  }
>  
> +static inline void
> +dlb2_event_build_hcws(struct dlb2_port *qm_port,
> +                   const struct rte_event ev[],
> +                   int num,
> +                   uint8_t *sched_type,
> +                   uint8_t *queue_id)
> +{

<snip>

> --- a/drivers/event/dlb2/dlb2_sse.c
> +++ b/drivers/event/dlb2/dlb2_sse.c
> @@ -2,172 +2,15 @@
>   * Copyright(c) 2022 Intel Corporation
>   */
>  
> -#include <stdint.h>
> -#include <stdbool.h>
> -
> -#ifndef CC_AVX512_SUPPORT
> -
>  #include "dlb2_priv.h"
> -#include "dlb2_iface.h"
> -#include "dlb2_inline_fns.h"
> -
>  /*
>   * This source file is only used when the compiler on the build machine
>   * does not support AVX512VL.
>   */

This comment needs updating. It's now used when the runtime platform
doesn't support AVX512.

>  
> -static uint8_t cmd_byte_map[DLB2_NUM_PORT_TYPES][DLB2_NUM_HW_SCHED_TYPES] = {
> -     {
> -             /* Load-balanced cmd bytes */
> -             [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> -             [RTE_EVENT_OP_FORWARD] = DLB2_FWD_CMD_BYTE,
> -             [RTE_EVENT_OP_RELEASE] = DLB2_COMP_CMD_BYTE,
> -     },
> -     {
> -             /* Directed cmd bytes */
> -             [RTE_EVENT_OP_NEW] = DLB2_NEW_CMD_BYTE,
> -             [RTE_EVENT_OP_FORWARD] = DLB2_NEW_CMD_BYTE,
> -             [RTE_EVENT_OP_RELEASE] = DLB2_NOOP_CMD_BYTE,
> -     },
> -};

<snip>

> +             _mm_storel_epi64((__m128i *)&qe[0].u.opaque_data, sse_qe[0]);
> +             _mm_storeh_pd((double *)&qe[1].u.opaque_data, 
> (__m128d)sse_qe[0]);
> +             _mm_storel_epi64((__m128i *)&qe[2].u.opaque_data, sse_qe[1]);
> +             _mm_storeh_pd((double *)&qe[3].u.opaque_data, 
> (__m128d)sse_qe[1]);
>  
>               qe[0].data = ev[0].u64;
>               qe[1].data = ev[1].u64;
>               qe[2].data = ev[2].u64;
>               qe[3].data = ev[3].u64;

While I'm not reviewing in detail the SSE/AVX512 code, since this patch
just seems to be moving the code around rather than writing it new, the
approach for building the 4 QEs seems a little strange, in that you do a
lot of work packing the data for 4 QEs into two SSE registers only to then
go unpacking them again. This leads to extra complexity having to document
in comments exactly how things are packed  Why not just build the metadata
for each QE directly into a single SSE register directly without packing?

/Bruce

Reply via email to