On Thu, Jun 18, 2020 at 9:53 AM Harry van Haaren <harry.van.haa...@intel.com> wrote: > > This commit adds an AVX-512 dpcls lookup implementation. > It uses the AVX-512 SIMD ISA to perform multiple miniflow > operations in parallel. > > To run this implementation, the "avx512f" and "bmi2" ISAs are > required. These ISA checks are performed at runtime while > probing the subtable implementation. If a CPU does not provide > both "avx512f" and "bmi2", then this code does not execute. > > The avx512 code is built as a seperate static library, with added > CFLAGS to enable the required ISA features. By building only this > static library with avx512 enabled, it is ensured that the main OVS > core library is *not* using avx512, and that OVS continues to run > as before on CPUs that do not support avx512. > > The approach taken in this implementation is to use the > gather instruction to access the packet miniflow, allowing > any miniflow blocks to be loaded into an AVX-512 register. > This maximises the usefulness of the register, and hence this > implementation handles any subtable with up to miniflow 8 bits. > > Note that specialization of these avx512 lookup routines > still provides performance value, as the hashing of the > resulting data is performed in scalar code, and compile-time > loop unrolling occurs when specialized to miniflow bits. > > Signed-off-by: Harry van Haaren <harry.van.haa...@intel.com> > > --- > > v4: > - Remove TODO comment on prio-set command (was accidentally > added to this commit in v3) > - Fixup v3 changlog to not include #warning comment (William Tu) > - Remove #define for debugging in lookup.h > - Fix builds on older gcc versions that don't support -mavx512f. > Solution involves CC_CHECK and #ifdefs in code (OVS Robot, William Tu) > > v3: > - Improve function name for _any subtable lookup > - Use "" include not <> for immintrin.h > - Add checks for SSE42 instructions in core OVS for CRC32 based hashing > If not available, disable AVX512 lookup implementation as it requires > uses CRC32 for hashing, and the hashing algorithm must match core OVS. > - Rework ovs_asserts() into function selection time check > - Add #define for magic number 8, number of u64 blocks in AVX512 register > - Add #if CHECKER around AVX code, sparse doesn't like checking it > - Simplify avx512 enabled building, fixes builds with --enable-shared > --- > configure.ac | 2 + > lib/automake.mk | 17 ++ > lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++ > lib/dpif-netdev-lookup.c | 17 ++ > lib/dpif-netdev-lookup.h | 4 + > 5 files changed, 305 insertions(+) > create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c > > diff --git a/configure.ac b/configure.ac > index 81893e56e..1367c868b 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -178,6 +178,8 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic]) > OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic]) > OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED]) > OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], > [HAVE_WNO_UNUSED_PARAMETER]) > +OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F]) > +OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])
Do you need both checks? I thought the first one OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F]) is good enough since at lib/automake.mk, you add the -mavx512f to CFLAGS. > OVS_ENABLE_WERROR > OVS_ENABLE_SPARSE > OVS_CTAGS_IDENTIFIERS > diff --git a/lib/automake.mk b/lib/automake.mk > index 1fc1a209e..fab056b8a 100644 > --- a/lib/automake.mk > +++ b/lib/automake.mk > @@ -11,6 +11,7 @@ lib_libopenvswitch_la_LIBADD = $(SSL_LIBS) > lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD) > lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD) > > + > if WIN32 > lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS} > endif > @@ -20,6 +21,22 @@ lib_libopenvswitch_la_LDFLAGS = \ > -Wl,--version-script=$(top_builddir)/lib/libopenvswitch.sym \ > $(AM_LDFLAGS) > > +if HAVE_AVX512F > +# Build library of avx512 code with CPU ISA CFLAGS enabled. This allows the > +# compiler to use the ISA features required for the ISA optimized code-paths. > +lib_LTLIBRARIES += lib/libopenvswitchavx512.la > +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la > +lib_libopenvswitchavx512_la_CFLAGS = \ > + -mavx512f \ > + -mavx512bw \ > + -mavx512dq \ > + -mbmi2 \ > + $(AM_CFLAGS) > +lib_libopenvswitchavx512_la_SOURCES = \ > + lib/dpif-netdev-lookup-avx512-gather.c > +endif > + > +# Build core vswitch libraries as before > lib_libopenvswitch_la_SOURCES = \ > lib/aes128.c \ > lib/aes128.h \ > diff --git a/lib/dpif-netdev-lookup-avx512-gather.c > b/lib/dpif-netdev-lookup-avx512-gather.c > new file mode 100644 > index 000000000..754cd0e3c > --- /dev/null > +++ b/lib/dpif-netdev-lookup-avx512-gather.c > @@ -0,0 +1,265 @@ > +/* > + * Copyright (c) 2020, Intel Corperation. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifdef __x86_64__ > +#if !defined(__CHECKER__) > + > +#include <config.h> > + > +#include "dpif-netdev.h" > +#include "dpif-netdev-lookup.h" > +#include "dpif-netdev-private.h" > +#include "cmap.h" > +#include "flow.h" > +#include "pvector.h" > +#include "openvswitch/vlog.h" > + > +#include "immintrin.h" > + > +/* Each AVX512 register (zmm register in assembly notation) can contain up to > + * 512 bits, which is equivelent to 8 uint64_t variables. This is the maximum typo: equivalent > + * number of miniflow blocks that can be processed in a single pass of the > + * AVX512 code at a time. > + */ > +#define NUM_U64_IN_ZMM_REG (8) > +#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG) > + > + > +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather); > + > +static inline __m512i > +_mm512_popcnt_epi64_manual(__m512i v_in) > +{ > + static const uint8_t pop_lut[64] = { > + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, > + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, > + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, > + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, > + }; > + __m512i v_pop_lut = _mm512_loadu_si512(pop_lut); > + > + __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4); > + __m512i v_nibble_mask = _mm512_set1_epi8(0xF); > + __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask); > + __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask); > + > + __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo); > + __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi); > + __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop); > + > + return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512()); > +} I forgot whether you mentioned this or not. But why create this manual popcnt? Isn't there a _mm512_popcnt_* in the library? The rest looks good to me, Thanks William _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev