On 24 Nov 2022, at 10:30, Emma Finn wrote:
> This commit adds support for the AVX512 implementation of the
> ipv6_set_addrs action as well as an AVX512 implementation of
> updating the L4 checksums.
>
> Signed-off-by: Emma Finn <emma.f...@intel.com>
Thanks Emma for the v4, I have one question and a couple of style issues. To
speed things up I just provide the diff for the style issues.
I was not able to do any actual testing, as my system did not have the
avx512vbmi extension :(
Cheers,
Eelco
> ---
Style issues diff:
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 82ff7e647..f798d6708 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -20,9 +20,9 @@
#include <config.h>
#include <errno.h>
-#include <sys/types.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
+#include <sys/types.h>
#include "csum.h"
#include "dp-packet.h"
@@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header)
* horizontal add. */
__m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
0xF, 0xF, 0xF, 0xF);
- v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+ v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
@@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i
new_header)
{
uint16_t old_delta = avx512_ipv6_sum_header(old_header);
uint16_t new_delta = avx512_ipv6_sum_header(new_header);
- uint32_t csum_delta = (uint16_t)~old_delta + new_delta;
+ uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta;
return ~csum_finish(csum_delta);
}
@@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct dp_packet_batch
*batch,
__m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
__m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
-
/* Set the v_zero register to all zero's. */
const __m128i v_zeros = _mm_setzero_si128();
+
/* Set the v_all_ones register to all one's. */
const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
- /* Load ip6 src and dst respectively into 128-bit wide registers. */
+ /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
__m128i v_src = _mm_loadu_si128((void *) mask);
- __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
+ __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask);
/* Perform a bitwise OR between src and dst registers. */
__m128i v_or = _mm_or_si128(v_src, v_dst);
> v4:
> - Reworked and moved check for checksum outside loop.
> - Code cleanup based on review from Eelco.
> v3:
> - Added a runtime check for AVX512 vbmi.
> v2:
> - Added check for availbility of s6_addr32 field of struct in6_addr.
> - Fixed network headers for freebsd builds.
>
<SNIP>
> + /* Load ip6 src and dst respectively into 128-bit wide registers. */
> + __m128i v_src = _mm_loadu_si128((void *) mask);
> + __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
Guess it might be me, but I do not understand how _mm_maskz_loadu_epi64() will
load the dst from the mask.
Looking at the intrinsics guide it will only read the first two 64-bit values,
but mask points to src?
Should we not just do the following here?
+ __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
+ __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
> +
> + /* Perform a bitwise OR between src and dst registers. */
> + __m128i v_or = _mm_or_si128(v_src, v_dst);
> +
> + /* Will return true if any bit has been set in v_or, else it will return
> + * false. */
> + bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
> +
> + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> +
> + /* Load the 40 bytes of the IPv6 header. */
> + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
> +
> + /* AND the v_pkt_mask to the packet data (v_packet). */
> + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
> +
> + /* OR the new addresses (v_key_shuf) with the masked packet addresses
> + * (v_pkt_masked). */
> + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
> +
> + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> + * be updated. */
> + if (do_checksum) {
> + uint8_t proto = nh->ip6_nxt;
> + uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> + v_new_hdr);
> +
> + if (proto == IPPROTO_UDP) {
> + struct udp_header *uh = dp_packet_l4(packet);
> +
> + if (uh->udp_csum) {
> + uint16_t old_udp_checksum = ~uh->udp_csum;
> + uint32_t udp_checksum = old_udp_checksum +
> delta_checksum;
> +
> + udp_checksum = csum_finish(udp_checksum);
> +
> + if (!udp_checksum) {
> + udp_checksum = htons(0xffff);
> + }
> +
> + uh->udp_csum = udp_checksum;
> + }
> + } else if (proto == IPPROTO_TCP) {
> + struct tcp_header *th = dp_packet_l4(packet);
> + uint16_t old_tcp_checksum = ~th->tcp_csum;
> + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
> +
> + tcp_checksum = csum_finish(tcp_checksum);
> + th->tcp_csum = tcp_checksum;
> + } else if (proto == IPPROTO_ICMPV6) {
> + struct icmp6_header *icmp = dp_packet_l4(packet);
> + uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
> + uint32_t icmp6_checksum = old_icmp6_checksum +
> delta_checksum;
> +
> + icmp6_checksum = csum_finish(icmp6_checksum);
> + icmp->icmp6_cksum = icmp6_checksum;
> + }
> + }
> + /* Write back the modified IPv6 addresses. */
> + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> + }
> +}
> +#endif /* HAVE_AVX512VBMI */
> +
> static void
> action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr
> *a)
> {
> @@ -514,6 +711,13 @@ action_avx512_init(struct odp_execute_action_impl *self
> OVS_UNUSED)
> impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> action_avx512_eth_set_addrs;
> impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
>
> +#if HAVE_AVX512VBMI
> + if (action_avx512vbmi_isa_probe()) {
> + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> + action_avx512_ipv6_set_addrs;
> + }
> +#endif
> +
> return 0;
> }
>
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index f80ae5a23..8b86b1e4f 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>
> #endif
>
> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> + if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> + return true;
> + }
> + return false;
> +}
> +#else
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> + return false;
> +}
> +#endif
> +
> static struct odp_execute_action_impl action_impls[] = {
> [ACTION_IMPL_AUTOVALIDATOR] = {
> .available = false,
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 940180c99..643f41c2a 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
> #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>
> bool action_avx512_isa_probe(void);
> +bool action_avx512vbmi_isa_probe(void);
>
> /* Odp execute init handles setting up the state of the actions functions at
> * initialization time. It cannot return errors, as it must always succeed in
> --
> 2.25.1
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev