>  static inline uint32_t
>  __rte_raw_cksum(const void *buf, size_t len, uint32_t sum)
>  {
> -     const void *end;
> -
> -     for (end = RTE_PTR_ADD(buf, RTE_ALIGN_FLOOR(len,
> sizeof(uint16_t)));
> -          buf != end; buf = RTE_PTR_ADD(buf, sizeof(uint16_t))) {
> -             uint16_t v;
> -
> -             memcpy(&v, buf, sizeof(uint16_t));
> -             sum += v;
> -     }
> +     /* Process uint16 chunks to preserve overflow/carry math.
> GCC/Clang vectorize the loop. */
> +     const unaligned_uint16_t *buf16 = (const unaligned_uint16_t
> *)buf;
> +     const unaligned_uint16_t *end = buf16 + (len / sizeof(uint16_t));
> +     for (; buf16 != end; buf16++)
> +             sum += *buf16;

Here are some more thoughts about loop unrolling...
In another mail [1], you are discussing manual loop unrolling for
rte_ipv4/ipv6_phdr_cksum().
Perhaps the compiler already unrolls those loops.
Check the assembly output for the existing code calling __rte_raw_cksum().
If the compiler doesn't unroll the loop in __rte_raw_cksum() for those two
functions, maybe you can help it by modifying __rte_raw_cksum(); try replacing
the end pointer with an integer counter, whose bound will be a compile-time
constant when called by rte_ipv4/ipv6_phdr_cksum().

[1]: 
https://inbox.dpdk.org/dev/CAFn2buA5NzmzA0+t1_5auigvQTyT7Ne6RMVaPVU=sdc03nd...@mail.gmail.com/

PS: When optimizing inline functions, I do the following: I add non-inline
wrapper functions that call the inline functions, and then use "objdump -S" to
look at the code generated for the wrappers. E.g.:

/*
 * Review helper: non-inline wrapper that forwards all three arguments to
 * __rte_raw_cksum() unchanged, so the code generated for the general
 * (run-time length) case can be inspected with "objdump -S".
 */
uint32_t
review__rte_raw_cksum(const void *buf, size_t len, uint32_t sum)
{
	return __rte_raw_cksum(buf, len, sum);
}

/*
 * Review helper: non-inline wrapper that calls __rte_raw_cksum() with the
 * length fixed to the literal 20, so the code generated for a constant
 * length can be inspected with "objdump -S".
 */
uint32_t
review__rte_raw_cksum_len20(const void *buf, uint32_t sum)
{
	return __rte_raw_cksum(buf, 20, sum);
}

/*
 * Review helper: non-inline wrapper that calls __rte_raw_cksum() with the
 * length fixed to the literal 8, so the code generated for a constant
 * length can be inspected with "objdump -S".
 */
uint32_t
review__rte_raw_cksum_len8(const void *buf, uint32_t sum)
{
	return __rte_raw_cksum(buf, 8, sum);
}

> 
>       /* if length is odd, keeping it byte order independent */
> -     if (unlikely(len % 2)) {
> +     if (len & 1) {
>               uint16_t left = 0;
> -
>               memcpy(&left, end, 1);
>               sum += left;
>       }
> diff --git a/lib/net/rte_ip4.h b/lib/net/rte_ip4.h
> index 822a660cfb..63852717c9 100644
> --- a/lib/net/rte_ip4.h
> +++ b/lib/net/rte_ip4.h
> @@ -223,21 +223,17 @@ rte_ipv4_phdr_cksum(const struct rte_ipv4_hdr
> *ipv4_hdr, uint64_t ol_flags)
>               uint8_t  zero;     /* zero. */
>               uint8_t  proto;    /* L4 protocol type. */
>               uint16_t len;      /* L4 length. */
> -     } psd_hdr;
> -
> -     uint32_t l3_len;
> -
> -     psd_hdr.src_addr = ipv4_hdr->src_addr;
> -     psd_hdr.dst_addr = ipv4_hdr->dst_addr;
> -     psd_hdr.zero = 0;
> -     psd_hdr.proto = ipv4_hdr->next_proto_id;
> -     if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG)) {
> -             psd_hdr.len = 0;
> -     } else {
> -             l3_len = rte_be_to_cpu_16(ipv4_hdr->total_length);
> -             psd_hdr.len = rte_cpu_to_be_16((uint16_t)(l3_len -
> -                     rte_ipv4_hdr_len(ipv4_hdr)));
> -     }
> +     } psd_hdr = {
> +             .src_addr = ipv4_hdr->src_addr,
> +             .dst_addr = ipv4_hdr->dst_addr,
> +             .proto = ipv4_hdr->next_proto_id,
> +             .len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
> RTE_MBUF_F_TX_UDP_SEG))
> +                     ? (uint16_t)0
> +                     :
> rte_cpu_to_be_16((uint16_t)(rte_be_to_cpu_16(ipv4_hdr->total_length) -
> +                                     rte_ipv4_hdr_len(ipv4_hdr)))
> +     };
> +     RTE_SUPPRESS_UNINITIALIZED_WARNING(psd_hdr);
> +
>       return rte_raw_cksum(&psd_hdr, sizeof(psd_hdr));
>  }
> 
> diff --git a/lib/net/rte_ip6.h b/lib/net/rte_ip6.h
> index d1abf1f5d5..8a7e5e4b8a 100644
> --- a/lib/net/rte_ip6.h
> +++ b/lib/net/rte_ip6.h
> @@ -560,19 +560,18 @@ rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr
> *ipv6_hdr, uint64_t ol_flags)
>  static inline uint16_t
>  rte_ipv6_phdr_cksum(const struct rte_ipv6_hdr *ipv6_hdr, uint64_t
> ol_flags)
>  {
> -     uint32_t sum;
>       struct {
>               rte_be32_t len;   /* L4 length. */
>               rte_be32_t proto; /* L4 protocol - top 3 bytes must be zero
> */
> -     } psd_hdr;
> -
> -     psd_hdr.proto = (uint32_t)(ipv6_hdr->proto << 24);
> -     if (ol_flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG))
> -             psd_hdr.len = 0;
> -     else
> -             psd_hdr.len = ipv6_hdr->payload_len;
> +     } psd_hdr = {
> +             .len = (ol_flags & (RTE_MBUF_F_TX_TCP_SEG |
> RTE_MBUF_F_TX_UDP_SEG))
> +                     ? (rte_be32_t)0
> +                     : ipv6_hdr->payload_len,
> +             .proto = (uint32_t)(ipv6_hdr->proto << 24)
> +     };
> +     RTE_SUPPRESS_UNINITIALIZED_WARNING(psd_hdr);
> 
> -     sum = __rte_raw_cksum(&ipv6_hdr->src_addr,
> +     uint32_t sum = __rte_raw_cksum(&ipv6_hdr->src_addr,
>               sizeof(ipv6_hdr->src_addr) + sizeof(ipv6_hdr->dst_addr),
>               0);
>       sum = __rte_raw_cksum(&psd_hdr, sizeof(psd_hdr), sum);
> --
> 2.39.5 (Apple Git-154)

Reply via email to