On Tue, Mar 4, 2025 at 8:29 AM David Marchand <[email protected]> wrote:
>
> As the virtio-net offload API is used for netdev-linux ports, but
> provides no information about the potentially encapsulated protocol
> concerned by a checksum request, specific informations from this netdev-
> specific implementation is propagated into OVS code, and must be
> carefully evaluated when some tunnel gets decapsulated.
>
> This induces a cost in "normal" processing path, while the netdev-linux
> path is not performance critical.
>
> This patch removes such specific information, yet try harder to parse
> the packet on the Rx side and set offload flags accordingly for non
> encapsulated traffic.
> If the parsing fails, resolve the checksum right away and
> mark the packet as good.
>
> Signed-off-by: David Marchand <[email protected]>
> ---
> Changes since v1:
> - fixed patch reordering issue: offloads fields was being added instead
> of later in this series,
>
> ---
> lib/dp-packet.c | 3 -
> lib/dp-packet.h | 24 --------
> lib/dpif-netdev-extract-avx512.c | 2 -
> lib/flow.c | 3 -
> lib/netdev-linux.c | 97 ++++++++++++++++++++++++--------
> lib/netdev-native-tnl.c | 6 +-
> 6 files changed, 75 insertions(+), 60 deletions(-)
>
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index dad0d7be3a..6a9bfd63ba 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -39,9 +39,6 @@ dp_packet_init__(struct dp_packet *b, size_t allocated,
> enum dp_packet_source so
> dp_packet_init_specific(b);
> /* By default assume the packet type to be Ethernet. */
> b->packet_type = htonl(PT_ETH);
> - /* Reset csum start and offset. */
> - b->csum_start = 0;
> - b->csum_offset = 0;
> }
>
> static void
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 549802167d..084e89bd99 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -178,8 +178,6 @@ struct dp_packet {
> or UINT16_MAX. */
> uint32_t cutlen; /* length in bytes to cut from the end. */
> ovs_be32 packet_type; /* Packet type as defined in OpenFlow */
> - uint16_t csum_start; /* Position to start checksumming from. */
> - uint16_t csum_offset; /* Offset to place checksum. */
> union {
> struct pkt_metadata md;
> uint64_t data[DP_PACKET_CONTEXT_SIZE / 8];
> @@ -1539,34 +1537,12 @@ dp_packet_ol_set_l4_csum_bad(struct dp_packet *p)
> *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_L4_CKSUM_BAD;
> }
>
> -/* Marks packet 'p' with good integrity if checksum offload locations
> - * were provided. In the case of encapsulated packets, these values may
> - * be deeper into the packet than OVS might expect. But the packet
> - * should still be considered to have good integrity.
> - * The 'csum_start' is the offset from the begin of the packet headers.
> - * The 'csum_offset' is the offset from start to place the checksum.
> - * The csum_start and csum_offset fields are set from the virtio_net_hdr
> - * struct that may be provided by a netdev on packet ingress. */
> -static inline void
> -dp_packet_ol_l4_csum_check_partial(struct dp_packet *p)
> -{
> - if (p->csum_start && p->csum_offset) {
> - dp_packet_ol_set_l4_csum_partial(p);
> - }
> -}
> -
> static inline void
> dp_packet_reset_packet(struct dp_packet *b, int off)
> {
> dp_packet_set_size(b, dp_packet_size(b) - off);
> dp_packet_set_data(b, ((unsigned char *) dp_packet_data(b) + off));
> dp_packet_reset_offsets(b);
> -
> - if (b->csum_start >= off && b->csum_offset) {
> - /* Adjust values for decapsulation. */
> - b->csum_start -= off;
> - dp_packet_ol_set_l4_csum_partial(b);
> - }
> }
>
> static inline uint32_t ALWAYS_INLINE
> diff --git a/lib/dpif-netdev-extract-avx512.c
> b/lib/dpif-netdev-extract-avx512.c
> index 57ca4c71b7..3ae850c2d5 100644
> --- a/lib/dpif-netdev-extract-avx512.c
> +++ b/lib/dpif-netdev-extract-avx512.c
> @@ -776,7 +776,6 @@ mfex_ipv6_set_hwol(struct dp_packet *pkt)
> static void
> mfex_tcp_set_hwol(struct dp_packet *pkt)
> {
> - dp_packet_ol_l4_csum_check_partial(pkt);
> if (dp_packet_l4_checksum_good(pkt)
> || dp_packet_ol_l4_csum_partial(pkt)) {
> dp_packet_hwol_set_csum_tcp(pkt);
> @@ -786,7 +785,6 @@ mfex_tcp_set_hwol(struct dp_packet *pkt)
> static void
> mfex_udp_set_hwol(struct dp_packet *pkt)
> {
> - dp_packet_ol_l4_csum_check_partial(pkt);
> if (dp_packet_l4_checksum_good(pkt)
> || dp_packet_ol_l4_csum_partial(pkt)) {
> dp_packet_hwol_set_csum_udp(pkt);
> diff --git a/lib/flow.c b/lib/flow.c
> index 61e732aff8..80b76d8f87 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -1080,7 +1080,6 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> }
> - dp_packet_ol_l4_csum_check_partial(packet);
> if (dp_packet_l4_checksum_good(packet)
> || dp_packet_ol_l4_csum_partial(packet)) {
> dp_packet_hwol_set_csum_tcp(packet);
> @@ -1100,7 +1099,6 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> }
> - dp_packet_ol_l4_csum_check_partial(packet);
> if (dp_packet_l4_checksum_good(packet)
> || dp_packet_ol_l4_csum_partial(packet)) {
> if (tunneling) {
> @@ -1118,7 +1116,6 @@ miniflow_extract(struct dp_packet *packet, struct
> miniflow *dst)
> miniflow_push_be16(mf, tp_dst, sctp->sctp_dst);
> miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
> miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
> - dp_packet_ol_l4_csum_check_partial(packet);
> if (dp_packet_l4_checksum_good(packet)
> || dp_packet_ol_l4_csum_partial(packet)) {
> dp_packet_hwol_set_csum_sctp(packet);
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index a89f7b8a21..49ed25cbee 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -89,6 +89,7 @@ COVERAGE_DEFINE(netdev_get_hwaddr);
> COVERAGE_DEFINE(netdev_set_hwaddr);
> COVERAGE_DEFINE(netdev_get_ethtool);
> COVERAGE_DEFINE(netdev_set_ethtool);
> +COVERAGE_DEFINE(netdev_linux_unknown_l4_csum);
>
>
> #ifndef IFLA_IF_NETNSID
> @@ -7039,49 +7040,64 @@ af_packet_sock(void)
> }
>
> static int
> -netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
> +netdev_linux_parse_packet(struct dp_packet *b, uint16_t *l2_len,
> + uint16_t *l3_len, uint16_t *l4proto)
> {
> struct eth_header *eth_hdr;
> ovs_be16 eth_type;
> - int l2_len;
>
> eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
> if (!eth_hdr) {
> return -EINVAL;
> }
>
> - l2_len = ETH_HEADER_LEN;
> + *l2_len = ETH_HEADER_LEN;
> eth_type = eth_hdr->eth_type;
> if (eth_type_vlan(eth_type)) {
> - struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
> + struct vlan_header *vlan = dp_packet_at(b, *l2_len, VLAN_HEADER_LEN);
>
> if (!vlan) {
> return -EINVAL;
> }
>
> eth_type = vlan->vlan_next_type;
> - l2_len += VLAN_HEADER_LEN;
> + *l2_len += VLAN_HEADER_LEN;
> }
>
> if (eth_type == htons(ETH_TYPE_IP)) {
> - struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
> + struct ip_header *ip_hdr = dp_packet_at(b, *l2_len, IP_HEADER_LEN);
>
> if (!ip_hdr) {
> return -EINVAL;
> }
>
> + *l3_len = IP_IHL(ip_hdr->ip_ihl_ver) * 4;
> *l4proto = ip_hdr->ip_proto;
> dp_packet_hwol_set_tx_ipv4(b);
> } else if (eth_type == htons(ETH_TYPE_IPV6)) {
> struct ovs_16aligned_ip6_hdr *nh6;
> + const void *data;
> + uint8_t nw_proto;
> + uint8_t nw_frag;
> + size_t size;
>
> - nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
> + nh6 = dp_packet_at(b, *l2_len, IPV6_HEADER_LEN);
> if (!nh6) {
> return -EINVAL;
> }
>
> - *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
> + nw_proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
> + data = (const char *) nh6 + sizeof *nh6;
> + size = (const char *) dp_packet_tail(b) - (const char *) data;
> + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag,
> + NULL, NULL)) {
> + return -EINVAL;
> + }
> + *l3_len = (const char *) data - (const char *) nh6;
> + *l4proto = nw_proto;
> dp_packet_hwol_set_tx_ipv6(b);
> + } else {
> + *l3_len = *l4proto = 0;
> }
>
> return 0;
> @@ -7104,25 +7120,60 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
> }
>
> if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> - uint16_t l4proto = 0;
> -
> - if (netdev_linux_parse_l2(b, &l4proto)) {
> + uint16_t csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
> + uint16_t csum_start = (OVS_FORCE uint16_t) vnet->csum_start;
> + bool l4_csum_partial = false;
> + uint16_t l4proto;
> + uint16_t l2_len;
> + uint16_t l3_len;
> +
> + if (netdev_linux_parse_packet(b, &l2_len, &l3_len, &l4proto)) {
> return EINVAL;
> }
>
> - if (l4proto == IPPROTO_UDP) {
> - dp_packet_hwol_set_csum_udp(b);
> + if (csum_start && csum_offset && csum_start == l2_len + l3_len) {
Hello David,
Still going through the patches, but one of the concerns I have is
I've previously encountered issues where the csum_start/offset
delivered via AF_PACKET was incorrect in the case of vlan+vxlan
packets. This is one of the reasons why OVS currently doesn't validate
these values.
I think it's reasonable to assume dp_packet_ol_set_l4_csum_partial if
these values are set.
> + if (csum_offset == offsetof(struct tcp_header, tcp_csum)
> + && l4proto == IPPROTO_TCP) {
> + dp_packet_hwol_set_csum_tcp(b);
> + l4_csum_partial = true;
> + } else if (csum_offset == offsetof(struct udp_header, udp_csum)
> + && l4proto == IPPROTO_UDP) {
> + dp_packet_hwol_set_csum_udp(b);
> + l4_csum_partial = true;
> + } else if (csum_offset == offsetof(struct sctp_header, sctp_csum)
> + && l4proto == IPPROTO_SCTP) {
> + dp_packet_hwol_set_csum_sctp(b);
I was thinking about the possibility of having a flag to indicate
something like dp_packet_hwol_set_csum_any(), it would be the union of
tcp, udp, and sctp. Then possibly on egress or miniflow_extract, we
could check for that value and populate the correct flag.
WDYT?
Cheers,
M
> + l4_csum_partial = true;
> + }
> + }
> + if (l4_csum_partial) {
> + dp_packet_ol_set_l4_csum_partial(b);
> + } else {
> + ovs_be16 *csum_l4;
> + void *l4;
> +
> + COVERAGE_INC(netdev_linux_unknown_l4_csum);
> +
> + csum_l4 = dp_packet_at(b, csum_start + csum_offset,
> + sizeof *csum_l4);
> + if (!csum_l4) {
> + return EINVAL;
> + }
> +
> + l4 = dp_packet_at(b, csum_start, dp_packet_size(b) - csum_start);
> + *csum_l4 = csum(l4, dp_packet_size(b) - csum_start);
> +
> + if (l4proto == IPPROTO_TCP) {
> + dp_packet_hwol_set_csum_tcp(b);
> + dp_packet_ol_set_l4_csum_good(b);
> + } else if (l4proto == IPPROTO_UDP) {
> + dp_packet_hwol_set_csum_udp(b);
> + dp_packet_ol_set_l4_csum_good(b);
> + } else if (l4proto == IPPROTO_SCTP) {
> + dp_packet_hwol_set_csum_sctp(b);
> + dp_packet_ol_set_l4_csum_good(b);
> + }
> }
> - /* The packet has offloaded checksum. However, there is no
> - * additional information like the protocol used, so it would
> - * require to parse the packet here. The checksum starting point
> - * and offset are going to be verified when the packet headers
> - * are parsed during miniflow extraction. */
> - b->csum_start = (OVS_FORCE uint16_t) vnet->csum_start;
> - b->csum_offset = (OVS_FORCE uint16_t) vnet->csum_offset;
> - } else {
> - b->csum_start = 0;
> - b->csum_offset = 0;
> }
>
> int ret = 0;
> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> index cbb875bd26..d49597f380 100644
> --- a/lib/netdev-native-tnl.c
> +++ b/lib/netdev-native-tnl.c
> @@ -338,11 +338,7 @@ netdev_tnl_push_udp_header(const struct netdev *netdev
> OVS_UNUSED,
> } else {
> dp_packet_hwol_set_csum_udp(packet);
> }
> - }
> -
> - if (packet->csum_start && packet->csum_offset) {
> - dp_packet_ol_set_l4_csum_partial(packet);
> - } else if (!udp->udp_csum) {
> + } else {
> dp_packet_ol_set_l4_csum_good(packet);
> }
>
> --
> 2.48.1
>
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev