> From: Harry van Haaren <harry.van.haa...@intel.com> > > This commit adds AVX512 implementations of miniflow extract. > By using the 64 bytes available in an AVX512 register, it is > possible to convert a packet to a miniflow data-structure in > a small quantity instructions. > > The implementation here probes for Ether()/IP()/UDP() traffic, > and builds the appropriate miniflow data-structure for packets > that match the probe. > > The implementation here is auto-validated by the miniflow > extract autovalidator, hence its correctness can be easily > tested and verified. > > Note that this commit is designed to easily allow addition of new > traffic profiles in a scalable way, without code duplication for > each traffic profile. >
Thanks Harry/Amber. Agree with what Flavio has proposed so far as well. A few more comments inline below. Note: A few comments refer to comment coding style; I haven't called out every instance as there are quite a few, but would recommend giving the comments in particular a look over to ensure they meet standards. BR Ian > Signed-off-by: Harry van Haaren <harry.van.haa...@intel.com> > --- > lib/automake.mk | 1 + > lib/dpif-netdev-extract-avx512.c | 416 ++++++++++++++++++++++++++++++ > lib/dpif-netdev-private-extract.c | 15 ++ > lib/dpif-netdev-private-extract.h | 19 ++ > 4 files changed, 451 insertions(+) > create mode 100644 lib/dpif-netdev-extract-avx512.c > > diff --git a/lib/automake.mk b/lib/automake.mk > index 3080bb04a..2b95d6f92 100644 > --- a/lib/automake.mk > +++ b/lib/automake.mk > @@ -39,6 +39,7 @@ lib_libopenvswitchavx512_la_CFLAGS = \ > $(AM_CFLAGS) > lib_libopenvswitchavx512_la_SOURCES = \ > lib/dpif-netdev-lookup-avx512-gather.c \ > + lib/dpif-netdev-extract-avx512.c \ > lib/dpif-netdev-avx512.c > lib_libopenvswitchavx512_la_LDFLAGS = \ > -static > diff --git a/lib/dpif-netdev-extract-avx512.c > b/lib/dpif-netdev-extract-avx512.c > new file mode 100644 > index 000000000..1145ac8a9 > --- /dev/null > +++ b/lib/dpif-netdev-extract-avx512.c > @@ -0,0 +1,416 @@ > +/* > + * Copyright (c) 2021 Intel. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. 
> + */ > + > +#ifdef __x86_64__ > +/* Sparse cannot handle the AVX512 instructions. */ > +#if !defined(__CHECKER__) > + > +#include <config.h> > +#include <errno.h> > +#include <immintrin.h> > +#include <stdint.h> > +#include <string.h> > + > +#include "flow.h" > +#include "dpdk.h" > + > +#include "dpif-netdev-private-dpcls.h" > +#include "dpif-netdev-private-extract.h" > +#include "dpif-netdev-private-flow.h" > + > +/* AVX512-BW level permutex2var_epi8 emulation. */ > +static inline __m512i > +__attribute__((target("avx512bw"))) > +_mm512_maskz_permutex2var_epi8_skx(__mmask64 k_mask, > + __m512i v_data_0, > + __m512i v_shuf_idxs, > + __m512i v_data_1) > +{ > + /* Manipulate shuffle indexes for u16 size. */ > + __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA; > + /* clear away ODD lane bytes. Cannot be done above due to no u8 shift */ Coding standard for comments. Capitalize Clear and add period at end of comment. > + __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes, > + v_shuf_idxs, _mm512_setzero_si512()); Alignment of arguments above seems a bit odd. Can we align vertically under k_mask_odd_lanes? > + v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1); > + > + __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9); > + > + /* Shuffle each half at 16-bit width */ For the comment above and multiple comments below, please add period at end of comment to keep with standard. 
> + __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, > v_shuf_idx_evn, > + v_data_1); > + __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, > v_shuf_idx_odd, > + v_data_1); > + > + /* Find if the shuffle index was odd, via mask and compare */ > + uint16_t index_odd_mask = 0x1; > + const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask); > + > + /* EVEN lanes, find if u8 index was odd, result as u16 bitmask */ > + __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs, > + v_index_mask_u16); > + __mmask32 evn_rotate_mask = > _mm512_cmpeq_epi16_mask(v_idx_even_masked, > + v_index_mask_u16); > + > + /* ODD lanes, find if u8 index was odd, result as u16 bitmask */ > + __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8); > + __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8, > + v_index_mask_u16); > + __mmask32 odd_rotate_mask = > _mm512_cmpeq_epi16_mask(v_idx_odd_masked, > + v_index_mask_u16); > + odd_rotate_mask = ~odd_rotate_mask; > + > + /* Rotate and blend results from each index */ > + __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1, > evn_rotate_mask, > + v_shuf1, 8); > + __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2, > odd_rotate_mask, > + v_shuf2, 8); > + > + /* If shuffle index was odd, blend shifted version */ > + __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes, > + v_shuf_res_evn, > v_shuf_res_odd); > + > + __m512i v_zeros = _mm512_setzero_si512(); > + __m512i v_result_kmskd = _mm512_mask_blend_epi8(k_mask, v_zeros, > + v_shuf_result); > + > + return v_result_kmskd; > +} > + > +/* Wrapper function required to enable ISA. */ > +static inline __m512i > +__attribute__((__target__("avx512vbmi"))) > +_mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, __m512i idx, > __m512i a) > +{ > + return _mm512_maskz_permutexvar_epi8(kmask, idx, a); > +} > + > + > +/* This file contains optimized implementations of miniflow_extract() > + * for specific common traffic patterns. 
The optimizations allow for > + * quick probing of a specific packet type, and if a match with a specific > + * type is found, a shuffle like proceedure builds up the required miniflow. > + * > + * The functionality here can be easily auto-validated and tested against the > + * scalar miniflow_extract() function. As such, manual review of the code by > + * the community (although welcome) is not required. Confidence in the > + * correctness of the code can be had from the autovalidation. > + */ > + > +/* Generator for EtherType masks and values. */ > +#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \ > + 0, 0, 0, 0, 0, 0, /* Ether MAC DST */ \ > + 0, 0, 0, 0, 0, 0, /* Ether MAC SRC */ \ > + type_b0, type_b1, /* EtherType */ > + > +#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF) > +#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00) > + > +/* Generator for checking IPv4 ver, ihl, and proto */ > +#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \ > + VER_IHL, /* Version and IHL */ \ > + 0, 0, 0, /* DSCP, ECN, Total Lenght */ \ Typo above, Length. > + 0, 0, /* Identification */ \ > + /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */ \ > + FLAG_OFF_B0, FLAG_OFF_B1, \ > + 0, /* TTL */ \ > + PROTO, /* Protocol */ \ > + 0, 0, /* Header checksum */ \ > + 0, 0, 0, 0, /* Src IP */ \ > + 0, 0, 0, 0, /* Dst IP */ > + > +#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF) > +#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11) > +#define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06) > + > +#define NU 0 > +#define PATTERN_IPV4_UDP_SHUFFLE \ > + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, NU, NU, /* Ether > */ \ > + 26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ > \ > + 34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP > */ \ > + NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* > Unused. 
*/ > + > + > +/* Generation of K-mask bitmask values, to zero out data in result. Note that > + * these correspond 1:1 to the above "*_SHUFFLE" values, and bit used must be > + * set in this K-mask, and "NU" values must be zero in the k-mask. Each mask > + * defined here represents 2 blocks, so 16 bytes, so 4 characters (eg. > 0xFFFF). > + * > + * Note the ULL suffix allows shifting by 32 or more without integer > overflow. > + */ > +#define KMASK_ETHER 0x1FFFULL > +#define KMASK_IPV4 0xF0FFULL > +#define KMASK_UDP 0x000FULL > + > +#define PATTERN_IPV4_UDP_KMASK \ > + (KMASK_ETHER | (KMASK_IPV4 << 16) | (KMASK_UDP << 32)) > + > + > +/* This union allows initializing static data as u8, but easily loading it > + * into AVX512 registers too. The union ensures proper alignment for the zmm. > + */ > +union mfex_data { > + uint8_t u8_data[64]; > + __m512i zmm; > +}; > + > +/* This structure represents a single traffic pattern. The AVX512 code to > + * enable the specifics for each pattern is largely the same, so it is > + * specialized to use the common profile data from here. > + * > + * Due to the nature of e.g. TCP flag handling, or VLAN CFI bit setting, > + * some profiles require additional processing. This is handled by having > + * all implementations call a post-process function, and specializing away > + * the big switch() that handles all traffic types. > + * > + * This approach reduces AVX512 code-duplication for each traffic type. > + */ > +struct mfex_profile { > + /* Required for probing a packet with the mfex pattern. */ > + union mfex_data probe_mask; > + union mfex_data probe_data; > + > + /* Required for reshaping packet into miniflow. */ > + union mfex_data store_shuf; > + __mmask64 store_kmsk; > + > + /* Constant data to set in mf.bits and dp_packet data on hit. 
*/ > + uint64_t mf_bits[2]; > + uint16_t dp_pkt_offs[4]; > + uint16_t dp_pkt_min_size; > +}; > + > +enum MFEX_PROFILES { > + PROFILE_ETH_IPV4_UDP, > + PROFILE_COUNT, > +}; > + > +/* Static const instances of profiles. These are compile-time constants, > + * and are specialized into individual miniflow-extract functions. > + */ > +static const struct mfex_profile mfex_profiles[PROFILE_COUNT] = > +{ > + [PROFILE_ETH_IPV4_UDP] = { > + .probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK > PATTERN_IPV4_MASK }, > + .probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 > PATTERN_IPV4_UDP}, > + > + .store_shuf.u8_data = { PATTERN_IPV4_UDP_SHUFFLE }, > + .store_kmsk = PATTERN_IPV4_UDP_KMASK, > + > + .mf_bits = { 0x18a0000000000000, 0x0000000000040401}, > + .dp_pkt_offs = { > + 0, UINT16_MAX, 14, 34, > + }, > + .dp_pkt_min_size = 42, > + }, > +}; > + > + > +/* Protocol specific helper functions, for calculating offsets/lenghts. */ > +static int32_t > +mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh, > + uint32_t len_from_ipv4) > +{ > + /* Handle dynamic l2_pad_size. */ > + uint16_t tot_len = ntohs(nh->ip_tot_len); > + if (OVS_UNLIKELY(tot_len > len_from_ipv4 || > + (len_from_ipv4 - tot_len) > UINT16_MAX)) { > + return -1; > + } > + dp_packet_set_l2_pad_size(pkt, len_from_ipv4 - tot_len); > + return 0; > +} > + > +/* Generic loop to process any mfex profile. This code is specialized into > + * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE > + * to ensure the compiler specializes each instance. The code is marked "hot" > + * to inform the compiler this is a hotspot in the program, encouraging > + * inlining of callee functions such as the permute calls. 
> + */ > +static inline uint32_t ALWAYS_INLINE > +__attribute__ ((hot)) > +mfex_avx512_process(struct dp_packet_batch *packets, > + struct netdev_flow_key *keys, > + uint32_t keys_size OVS_UNUSED, > + odp_port_t in_port, > + void *pmd_handle OVS_UNUSED, > + const enum MFEX_PROFILES profile_id, > + const uint32_t use_vbmi) > +{ > + uint32_t hitmask = 0; > + struct dp_packet *packet; > + > + /* Here the profile to use is chosen by the variable used to specialize > + * the function. This causes different MFEX traffic to be handled. > + */ > + const struct mfex_profile *profile = &mfex_profiles[profile_id]; > + > + /* Load profile constant data. */ > + __m512i v_vals = _mm512_loadu_si512(&profile->probe_data); > + __m512i v_mask = _mm512_loadu_si512(&profile->probe_mask); > + __m512i v_shuf = _mm512_loadu_si512(&profile->store_shuf); > + > + __mmask64 k_shuf = profile->store_kmsk; > + __m128i v_bits = _mm_loadu_si128((void *) &profile->mf_bits); > + uint16_t dp_pkt_min_size = profile->dp_pkt_min_size; > + > + __m128i v_zeros = _mm_setzero_si128(); > + __m128i v_blocks01 = _mm_insert_epi32(v_zeros, odp_to_u32(in_port), 1); > + > + DP_PACKET_BATCH_FOR_EACH (i, packet, packets) { > + /* If the packet is smaller than the probe size, skip it. */ > + const uint32_t size = dp_packet_size(packet); > + if (size < dp_pkt_min_size) { > + continue; > + } > + > + /* Load packet data and probe with AVX512 mask & compare. */ > + const uint8_t *pkt = dp_packet_data(packet); > + __m512i v_pkt0 = _mm512_loadu_si512(pkt); > + __m512i v_pkt0_masked = _mm512_and_si512(v_pkt0, v_mask); > + __mmask64 k_cmp = _mm512_cmpeq_epi8_mask(v_pkt0_masked, > v_vals); > + if (k_cmp != UINT64_MAX) { > + continue; > + } > + > + /* Copy known dp packet offsets to the dp_packet instance. */ > + memcpy(&packet->l2_pad_size, &profile->dp_pkt_offs, > + sizeof(uint16_t) * 4); > + > + /* Store known miniflow bits and first two blocks. 
*/ > + struct miniflow *mf = &keys[i].mf; > + uint64_t *bits = (void *) &mf->map.bits[0]; > + uint64_t *blocks = miniflow_values(mf); > + _mm_storeu_si128((void *) bits, v_bits); > + _mm_storeu_si128((void *) blocks, v_blocks01); > + > + /* Permute the packet layout into miniflow blocks shape. > + * As different avx512 ISA levels have different implementations, Minor, but I would capitalize 'AVX512' above; that seems to be the standard way it is referred to in the other comments. > + * this specializes on the "use_vbmi" attribute passed in. > + */ > + __m512i v512_zeros = _mm512_setzero_si512(); > + __m512i v_blk0 = v512_zeros; > + if (__builtin_constant_p(use_vbmi) && use_vbmi) { > + v_blk0 = _mm512_maskz_permutexvar_epi8_wrap(k_shuf, v_shuf, > + v_pkt0); > + } else { > + v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0, > + v_shuf, v512_zeros); > + } > + _mm512_storeu_si512(&blocks[2], v_blk0); > + > + > + /* Perform "post-processing" per profile, handling details not easily > + * handled in the above generic AVX512 code. Examples include TCP > flag > + * parsing, adding the VLAN CFI bit, and handling IPv4 fragments. > + */ > + switch (profile_id) { > + case PROFILE_COUNT: > + ovs_assert(0); /* avoid compiler warning on missing ENUM */ > + break; > + > + case PROFILE_ETH_IPV4_UDP: { > + /* Handle dynamic l2_pad_size. */ > + uint32_t size_from_ipv4 = size - sizeof(struct eth_header); > + struct ip_header *nh = (void *)&pkt[sizeof(struct > eth_header)]; > + if (mfex_ipv4_set_l2_pad_size(packet, nh, size_from_ipv4)) { > + continue; > + } > + > + } break; > + default: > + break; > + }; > + > + /* This packet has its miniflow created, add to hitmask. 
*/ > + hitmask |= 1 << i; > + } > + > + return hitmask; > +} > + > + > +#define DECLARE_MFEX_FUNC(name, profile) \ > +uint32_t \ > +__attribute__((__target__("avx512f"))) \ > +__attribute__((__target__("avx512vl"))) \ > +__attribute__((__target__("avx512vbmi"))) \ > +mfex_avx512_vbmi_##name(struct dp_packet_batch *packets, \ > + struct netdev_flow_key *keys, uint32_t keys_size, \ > + odp_port_t in_port, void *pmd_handle) \ > +{ \ > + return mfex_avx512_process(packets, keys, keys_size, in_port, \ > + pmd_handle, profile, 1); \ > +} \ > + \ > +uint32_t \ > +__attribute__((__target__("avx512f"))) \ > +__attribute__((__target__("avx512vl"))) \ > +mfex_avx512_##name(struct dp_packet_batch *packets, \ > + struct netdev_flow_key *keys, uint32_t keys_size, \ > + odp_port_t in_port, void *pmd_handle) \ > +{ \ > + return mfex_avx512_process(packets, keys, keys_size, in_port, \ > + pmd_handle, profile, 0); \ > +} > + > +/* Each profile gets a single declare here, which specializes the function > + * as required. > + */ > +DECLARE_MFEX_FUNC(ip_udp,PROFILE_ETH_IPV4_UDP) > + > + > +static int32_t > +avx512_isa_probe(uint32_t needs_vbmi) > +{ > + static const char *isa_required[] = { > + "avx512f", > + "avx512bw", > + "bmi2", > + }; > + > + int32_t ret = 0; > + for (uint32_t i = 0; i < ARRAY_SIZE(isa_required); i++) { > + if (!dpdk_get_cpu_has_isa("x86_64", isa_required[i])) { > + ret = -ENOTSUP; > + } > + } > + > + if (needs_vbmi) { > + if (!dpdk_get_cpu_has_isa("x86_64", "avx512vbmi")) { > + ret = -ENOTSUP; > + } > + } > + > + return ret; > +} > + > +/* Probe functions to check ISA requirements. 
*/ > +int32_t > +mfex_avx512_probe(void) > +{ > + const uint32_t needs_vbmi = 0; > + return avx512_isa_probe(needs_vbmi); > +} > + > +int32_t > +mfex_avx512_vbmi_probe(void) > +{ > + const uint32_t needs_vbmi = 1; > + return avx512_isa_probe(needs_vbmi); > +} > + > +#endif /* __CHECKER__ */ > +#endif /* __x86_64__ */ > diff --git a/lib/dpif-netdev-private-extract.c > b/lib/dpif-netdev-private-extract.c > index 2008e5ee5..106a83867 100644 > --- a/lib/dpif-netdev-private-extract.c > +++ b/lib/dpif-netdev-private-extract.c > @@ -47,8 +47,23 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = { > .extract_func = mfex_study_traffic, > .name = "study", > }, > + > +/* Compile in implementations only if the compiler ISA checks pass. */ > +#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && > __SSE4_2__) > + { > + .probe = mfex_avx512_vbmi_probe, > + .extract_func = mfex_avx512_vbmi_ip_udp, > + .name = "avx512_vbmi_ipv4_udp", > + }, > + { > + .probe = mfex_avx512_probe, > + .extract_func = mfex_avx512_ip_udp, > + .name = "avx512_ipv4_udp", > + }, > +#endif > }; > > + > BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls)); > > int32_t > diff --git a/lib/dpif-netdev-private-extract.h > b/lib/dpif-netdev-private-extract.h > index 0ec74bef9..f32be202a 100644 > --- a/lib/dpif-netdev-private-extract.h > +++ b/lib/dpif-netdev-private-extract.h > @@ -136,4 +136,23 @@ > dpif_miniflow_extract_set_default(miniflow_extract_func func); > uint32_t mfex_set_study_pkt_cnt(uint32_t pkt_cmp_count, > struct dpif_miniflow_extract_impl *opt); > > +/* AVX512 MFEX Probe and Implementations functions. 
*/ > +#ifdef __x86_64__ > +int32_t mfex_avx512_probe(void); > +int32_t mfex_avx512_vbmi_probe(void); > + > +#define DECLARE_AVX512_MFEX_PROTOTYPE(name) \ > + uint32_t \ > + mfex_avx512_vbmi_##name(struct dp_packet_batch *packets, \ > + struct netdev_flow_key *keys, uint32_t keys_size, \ > + odp_port_t in_port, void *pmd_handle); \ > + uint32_t \ > + mfex_avx512_##name(struct dp_packet_batch *packets, \ > + struct netdev_flow_key *keys, uint32_t keys_size, \ > + odp_port_t in_port, void *pmd_handle); > + > +DECLARE_AVX512_MFEX_PROTOTYPE(ip_udp); > +#endif /* __x86_64__ */ > + > + > #endif /* DPIF_NETDEV_AVX512_EXTRACT */ > -- > 2.25.1 > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev