On 7/23/25 2:50 PM, Somnath Chatterjee via dev wrote:
> When IP layer fragments packets, L4 header
> is only present in the first packet. The next fragmented
> packets donot contain L4 headers. So hash calculation based
> on 5tuple should ignore the L4 headers for all fragmented
> packets.
> If we consider L4 header for any fragments , say first or
> subsequent ones, the hash will vary which may result in
> selecting different dpdk port for egressing out packet.
> 
> Signed-off-by: Somnath Chatterjee <somnath.b.chatter...@ericsson.com>
> ---
>  lib/flow.c             |  41 +++++++----
>  tests/automake.mk      |   1 +
>  tests/library.at       |   4 +
>  tests/test-flow-hash.c | 161 +++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 193 insertions(+), 14 deletions(-)
>  create mode 100644 tests/test-flow-hash.c

Hi, Somnath.  Thanks for the update!

This patch failed CI, because of compilation failures in the test file.
Please, make sure it compiles without warnings in the next version.

Also, please add a space after the colon in the subject line, this will
fix the checkpatch complaint.  At the same time Ovs should be OVS.

See some more comments for the code below.

Best regards, Ilya Maximets.

> 
> diff --git a/lib/flow.c b/lib/flow.c
> index c2e1e4ad6..464678012 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -1075,10 +1075,12 @@ miniflow_extract(struct dp_packet *packet, struct 
> miniflow *dst)
>                      miniflow_push_be16(mf, tp_dst, tcp->tcp_dst);
>                      miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>                      miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
> -                    if (dl_type == htons(ETH_TYPE_IP)) {
> -                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> -                    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> -                        dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> +                    if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
> +                        if (dl_type == htons(ETH_TYPE_IP)) {
> +                            dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                        } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +                            dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> +                        }
>                      }

This will make processing of fragmented packets slower as we'll no longer
calculete their hashes while the cache is hot.  I'd suggest we instead add
a new parameter 'bool frag' to the dp_packet_update_rss_hash_ipv4_tcp_udp()
and a corresponding ipv6 function and call them like this:

                        dp_packet_update_rss_hash_ipv4_tcp_udp(
                            packet, !!(nw_frag & FLOW_NW_FRAG_MASK));

Inside the function, part of the code that adds L4 ports would go under the
if (frag) case.

This way we can keep performance for fragmented packets mostly intact.

Note: these functions are also called form the avx512 code, but the frag
argument there will always be false, as that code doesn't match fragmented
traffic.

>                      dp_packet_ol_l4_csum_check_partial(packet);
>                      if (dp_packet_l4_checksum_good(packet)
> @@ -1095,10 +1097,12 @@ miniflow_extract(struct dp_packet *packet, struct 
> miniflow *dst)
>                  miniflow_push_be16(mf, tp_dst, udp->udp_dst);
>                  miniflow_push_be16(mf, ct_tp_src, ct_tp_src);
>                  miniflow_push_be16(mf, ct_tp_dst, ct_tp_dst);
> -                if (dl_type == htons(ETH_TYPE_IP)) {
> -                    dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> -                } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> -                    dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> +                if (!(nw_frag & FLOW_NW_FRAG_MASK)) {
> +                    if (dl_type == htons(ETH_TYPE_IP)) {
> +                        dp_packet_update_rss_hash_ipv4_tcp_udp(packet);
> +                    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
> +                        dp_packet_update_rss_hash_ipv6_tcp_udp(packet);
> +                    }
>                  }
>                  dp_packet_ol_l4_csum_check_partial(packet);
>                  if (dp_packet_l4_checksum_good(packet)
> @@ -2357,6 +2361,7 @@ miniflow_hash_5tuple(const struct miniflow *flow, 
> uint32_t basis)
>      if (flow) {
>          ovs_be16 dl_type = MINIFLOW_GET_BE16(flow, dl_type);
>          uint8_t nw_proto;
> +        uint8_t nw_frag;
>  
>          if (dl_type == htons(ETH_TYPE_IPV6)) {
>              struct flowmap map = FLOWMAP_EMPTY_INITIALIZER;
> @@ -2377,10 +2382,12 @@ miniflow_hash_5tuple(const struct miniflow *flow, 
> uint32_t basis)
>          }
>  
>          nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
> +        nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
>          hash = hash_add(hash, nw_proto);
> -        if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
> +        if ((nw_frag & FLOW_NW_FRAG_MASK) ||
> +            (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
>              && nw_proto != IPPROTO_SCTP && nw_proto != IPPROTO_ICMP
> -            && nw_proto != IPPROTO_ICMPV6) {
> +            && nw_proto != IPPROTO_ICMPV6)) {
>              goto out;
>          }

This if sttement is very busy already.  Since it's just a goto out, I'd
suggest to create a separate if statement for the fragmentation check instead.
E.g.:

    nw_proto = MINIFLOW_GET_U8(flow, nw_proto);
    hash = hash_add(hash, nw_proto);

    nw_frag = MINIFLOW_GET_U8(flow, nw_frag);
    if (nw_frag & FLOW_NW_FRAG_MASK) {
        goto out;
    }

    if (nw_proto != IPPROTO_TCP && nw_proto != IPPROTO_UDP
        ...

>  
> @@ -2420,9 +2427,10 @@ flow_hash_5tuple(const struct flow *flow, uint32_t 
> basis)
>          }
>  
>          hash = hash_add(hash, flow->nw_proto);
> -        if (flow->nw_proto != IPPROTO_TCP && flow->nw_proto != IPPROTO_UDP
> +        if ((flow->nw_frag & FLOW_NW_FRAG_MASK) ||
> +            (flow->nw_proto != IPPROTO_TCP && flow->nw_proto != IPPROTO_UDP
>              && flow->nw_proto != IPPROTO_SCTP && flow->nw_proto != 
> IPPROTO_ICMP
> -            && flow->nw_proto != IPPROTO_ICMPV6) {
> +            && flow->nw_proto != IPPROTO_ICMPV6)) {
>              goto out;
>          }

Same here.

>  
> @@ -2430,6 +2438,7 @@ flow_hash_5tuple(const struct flow *flow, uint32_t 
> basis)
>          hash = hash_add(hash,
>                          ((const uint32_t *)flow)[offsetof(struct flow, 
> tp_src)
>                                                   / sizeof(uint32_t)]);
> +

Unrelated change.

>      }
>  out:
>      return hash_finish(hash, 42); /* Arbitrary number. */
> @@ -2467,7 +2476,9 @@ flow_hash_symmetric_l4(const struct flow *flow, 
> uint32_t basis)
>      if (fields.eth_type == htons(ETH_TYPE_IP)) {
>          fields.ipv4_addr = flow->nw_src ^ flow->nw_dst;
>          fields.ip_proto = flow->nw_proto;
> -        if (fields.ip_proto == IPPROTO_TCP || fields.ip_proto == 
> IPPROTO_SCTP) {
> +        if (!(flow->nw_frag & FLOW_NW_FRAG_MASK) &&
> +            (fields.ip_proto == IPPROTO_TCP ||
> +            fields.ip_proto == IPPROTO_SCTP)) {

Shift this line 1 space to the right to visually align with the previous
fields.ip_proto.

>              fields.tp_port = flow->tp_src ^ flow->tp_dst;
>          }
>      } else if (fields.eth_type == htons(ETH_TYPE_IPV6)) {
> @@ -2479,7 +2490,9 @@ flow_hash_symmetric_l4(const struct flow *flow, 
> uint32_t basis)
>              ipv6_addr[i] = a[i] ^ b[i];
>          }
>          fields.ip_proto = flow->nw_proto;
> -        if (fields.ip_proto == IPPROTO_TCP || fields.ip_proto == 
> IPPROTO_SCTP) {
> +        if (!(flow->nw_frag & FLOW_NW_FRAG_MASK) &&
> +            (fields.ip_proto == IPPROTO_TCP ||
> +            fields.ip_proto == IPPROTO_SCTP)) {

Same here.

>              fields.tp_port = flow->tp_src ^ flow->tp_dst;
>          }
>      }
> diff --git a/tests/automake.mk b/tests/automake.mk
> index 59f538761..7586d75f7 100644
> --- a/tests/automake.mk
> +++ b/tests/automake.mk
> @@ -458,6 +458,7 @@ tests_ovstest_SOURCES = \
>       tests/test-cooperative-multitasking.c \
>       tests/test-csum.c \
>       tests/test-flows.c \
> +        tests/test-flow-hash.c \

Tabs should be used for alignemnt in this file.

>       tests/test-hash.c \
>       tests/test-heap.c \
>       tests/test-hindex.c \
> diff --git a/tests/library.at b/tests/library.at
> index 82ac80a27..a0f5f30fa 100644
> --- a/tests/library.at
> +++ b/tests/library.at
> @@ -308,3 +308,7 @@ AT_CHECK([ovstest 
> test-cooperative-multitasking-nested-yield], [0], [], [dnl
>  cooperative_multitasking|ERR|Nested yield avoided, this is a bug! Enable 
> debug logging for more details.
>  ])
>  AT_CLEANUP
> +
> +AT_SETUP([test-flow-hash module])
> +AT_CHECK([ovstest test-flow-hash], [0], [], [ignore])
> +AT_CLEANUP
> diff --git a/tests/test-flow-hash.c b/tests/test-flow-hash.c
> new file mode 100644
> index 000000000..1d8e92a98
> --- /dev/null
> +++ b/tests/test-flow-hash.c
> @@ -0,0 +1,161 @@
> +#include <config.h>
> +#include <string.h>
> +#include "lib/flow.h"
> +#include "lib/dp-packet.h"
> +#include "tests/ovstest.h"
> +#include "util.h"
> +#include "openvswitch/vlog.h"
> +
> +VLOG_DEFINE_THIS_MODULE(test_flow_hash);
> +
> +struct packet_hashes {
> +    uint32_t flow_hash;
> +    uint32_t miniflow_hash;
> +};
> +
> +/* Utility to print hex dump of a packet */
> +static void
> +log_hex_dump(const char *label, const uint8_t *data, size_t len)
> +{
> +    char line[128];
> +    size_t i;
> +
> +    VLOG_INFO("%s (len=%lu):", label, (unsigned long) len);
> +    for (i = 0; i < len; i += 16) {
> +        size_t j, line_len = 0;
> +        line_len += snprintf(line + line_len, sizeof(line) - line_len,
> +                             "  %04x: ", i);
> +        for (j = 0; j < 16 && i + j < len; j++) {
> +            line_len += snprintf(line + line_len, sizeof(line) - line_len,
> +                                 "%02x ", data[i + j]);
> +        }
> +        VLOG_INFO("%s", line);
> +    }

Can use ds_put_hex_dump() instead.

> +}
> +
> +/* Extracts flow and miniflow from packet and computes their 5-tuple hashes 
> */
> +static struct packet_hashes
> +get_packet_hashes(const void *data, size_t size,
> +                  uint32_t basis,
> +                  const char *label)
> +{
> +    struct packet_hashes hashes;
> +    struct dp_packet packet;

An empty line between declarations and the code.

> +    dp_packet_use_const(&packet, data, size);
> +
> +    struct flow flow;

And here.

> +    flow_extract(&packet, &flow);
> +    hashes.flow_hash = flow_hash_5tuple(&flow, basis);
> +
> +    struct {
> +        struct miniflow miniflow;
> +        uint64_t buffer[FLOW_U64S];
> +    } mf_buf;

And here.

> +    miniflow_extract(&packet, &mf_buf.miniflow);
> +    hashes.miniflow_hash = miniflow_hash_5tuple(&mf_buf.miniflow, basis);
> +
> +    const char *frag_type;
> +    uint8_t frag_flags = flow.nw_frag & FLOW_NW_FRAG_MASK;

Declarations should generally go in the reverse x-mass tree order,
i.e. lines from the longest to the shortest.

> +
> +    if (frag_flags == 0) {
> +        frag_type = "not fragmented";
> +    } else if (frag_flags & FLOW_NW_FRAG_LATER) {
> +        frag_type = "non-first fragment";
> +    } else {
> +        frag_type = "first fragment or unknown";
> +    }
> +
> +    VLOG_INFO("%s:", label);
> +    VLOG_INFO("  Src IP: "IP_FMT", Dst IP: "IP_FMT,
> +              IP_ARGS(&flow.nw_src),
> +              IP_ARGS(&flow.nw_dst));
> +    VLOG_INFO("  Src Port: %u, Dst Port: %u, Proto: %u",
> +              ntohs(flow.tp_src),
> +              ntohs(flow.tp_dst),
> +              flow.nw_proto);
> +    VLOG_INFO("  Frag Type: %s", frag_type);
> +    VLOG_INFO("  flow_hash:     0x%08x", hashes.flow_hash);
> +    VLOG_INFO("  miniflow_hash: 0x%08x", hashes.miniflow_hash);

PRIx32

May also collct all this into a dynamic string and then also
put the hex dump into it with ds_put_hex_dump().

> +
> +    log_hex_dump(label, data, size);
> +
> +    return hashes;
> +}
> +
> +static void
> +test_udp_fragment_hash_consistency(int argc OVS_UNUSED,
> +                                   char *argv[] OVS_UNUSED)
> +{
> +    const uint32_t basis = 54321;
> +
> +    /* Packet 1: Normal, unfragmented UDP packet (DF bit set). */
> +    const uint8_t normal_packet[] = {
> +        0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
> +        0x00,0x00,0x00,0x02,0x08,0x00,0x45,0x00,
> +        0x00,0x28,0x12,0x34,0x40,0x00,0x40,0x11,
> +        0x7c,0xd1,0xc0,0xa8,0x01,0x01,0xc0,0xa8,
> +        0x01,0x02,0xc0,0x01,0xc0,0x02,0x00,0x18,
> +        0xab,0xcd,'a','b','c','d','e','f','g',
> +        'h','i','j','k','l'

Please, add some spaces between values and a trailing comma at the end.
Same for all the hex arrays below.

> +    };
> +
> +    /* Packet 2: First fragment (MF bit set, offset 0). */
> +    const uint8_t first_frag_packet[] = {
> +        0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
> +        0x00,0x00,0x00,0x02,0x08,0x00,0x45,0x00,
> +        0x00,0x1c,0x12,0x34,0x20,0x00,0x40,0x11,
> +        0x7c,0xd7,0xc0,0xa8,0x01,0x01,0xc0,0xa8,
> +        0x01,0x02,0xc0,0x01,0xc0,0x02,0x00,0x18,
> +        0xab,0xcd
> +    };
> +
> +    /* Packet 3: Second/Middle fragment (MF bit set, non-zero offset). */
> +    const uint8_t middle_frag_packet[] = {
> +        0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
> +        0x00,0x00,0x00,0x02,0x08,0x00,0x45,0x00,
> +        0x00,0x1c,0x12,0x34,0x20,0x01,0x40,0x11,
> +        0x7c,0xd6,0xc0,0xa8,0x01,0x01,0xc0,0xa8,
> +        0x01,0x02,'i','j','k','l','m','n','o','p'
> +    };
> +
> +    /* Packet 4: Last fragment (MF bit clear, non-zero offset). */
> +    const uint8_t last_frag_packet[] = {
> +        0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
> +        0x00,0x00,0x00,0x02,0x08,0x00,0x45,0x00,
> +        0x00,0x1a,0x12,0x34,0x00,0x02,0x40,0x11,
> +        0x9c,0xd5,0xc0,0xa8,0x01,0x01,0xc0,0xa8,
> +        0x01,0x02,'q','r','s','t','u','v'
> +    };
> +
> +    struct packet_hashes
> +        normal_h = get_packet_hashes(normal_packet,
> +                                     sizeof(normal_packet),
> +                                     basis,
> +                                     "Normal Packet");

sizeof of a variable should not have paranthesis, as per our coding style.
It is also better to split the line after the '=', e.g.:

    struct packet_hashes normal_h =
        get_packet_hashes(normal_packet, sizeof normal_packet,
                          basis, "Normal Packet");

Same for other calls below.

> +    struct packet_hashes
> +        first_frag_h = get_packet_hashes(first_frag_packet,
> +                                         sizeof(first_frag_packet),
> +                                         basis,
> +                                         "First Fragment");
> +    struct packet_hashes
> +        middle_frag_h = get_packet_hashes(middle_frag_packet,
> +                                          sizeof(middle_frag_packet),
> +                                          basis,
> +                                          "Middle Fragment");
> +    struct packet_hashes
> +        last_frag_h = get_packet_hashes(last_frag_packet,
> +                                        sizeof(last_frag_packet),
> +                                        basis,
> +                                        "Last Fragment");
> +
> +    ovs_assert(normal_h.flow_hash      == normal_h.miniflow_hash);
> +    ovs_assert(first_frag_h.flow_hash  == first_frag_h.miniflow_hash);
> +    ovs_assert(middle_frag_h.flow_hash == middle_frag_h.miniflow_hash);
> +    ovs_assert(last_frag_h.flow_hash   == last_frag_h.miniflow_hash);

These are aligned, but the ones below are not.  Seems inconsistent.
I'd say there is no need to align them, as they are a bit hard to
read regardless of alignment.

> +
> +    ovs_assert(normal_h.flow_hash != first_frag_h.flow_hash);
> +    ovs_assert(middle_frag_h.flow_hash == last_frag_h.flow_hash);
> +    ovs_assert(first_frag_h.flow_hash == middle_frag_h.flow_hash);
> +}
> +
> +OVSTEST_REGISTER("test-flow-hash", test_udp_fragment_hash_consistency);

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to