I have tested this patch in linux with ovs version 2.8.2, and it seems worked.




>From fa24d308d40f37f890fead0b79ac1f0f7baa28ba Mon Sep 17 00:00:00 2001
From: hzchenyuefang <hzchenyuef...@corp.netease.com>
Date: Sat, 9 Nov 2019 10:14:23 +0800
Subject: [PATCH 1/1] fix skb_hash problem when sending from internal port


first packet need come to userspace first and when execute in datapath,
skb->hash value is changed to 0 hence the packet's skb->hash value need
to recalculate when used.  This may happend in such conditions: send
packet from internal port, and then do vxlan encapsulting, outer udp
header source port is calculated by skb->hash(see function
udp_flow_src_port), so same tcp session packet may have different outer
udp source port, this may affect load blance.
---
 datapath-windows/ovsext/BufferMgmt.h              |  1 +
 datapath-windows/ovsext/DpInternal.h              |  1 +
 datapath-windows/ovsext/User.c                    | 11 ++++++++++-
 datapath/datapath.c                               | 17 ++++++++++++++++-
 datapath/datapath.h                               |  0
 datapath/linux/compat/include/linux/openvswitch.h |  1 +
 lib/dpif-netlink.c                                |  7 ++++++-
 lib/dpif.h                                        |  2 ++
 ofproto/ofproto-dpif-upcall.c                     | 11 ++++++++++-
 9 files changed, 47 insertions(+), 4 deletions(-)
 mode change 100644 => 100755 datapath-windows/ovsext/BufferMgmt.h
 mode change 100644 => 100755 datapath-windows/ovsext/DpInternal.h
 mode change 100644 => 100755 datapath-windows/ovsext/User.c
 mode change 100644 => 100755 datapath/datapath.c
 mode change 100644 => 100755 datapath/datapath.h
 mode change 100644 => 100755 lib/dpif-netlink.c
 mode change 100644 => 100755 lib/dpif.h
 mode change 100644 => 100755 ofproto/ofproto-dpif-upcall.c


diff --git a/datapath-windows/ovsext/BufferMgmt.h 
b/datapath-windows/ovsext/BufferMgmt.h
old mode 100644
new mode 100755
index dcf310a..34f50a1
--- a/datapath-windows/ovsext/BufferMgmt.h
+++ b/datapath-windows/ovsext/BufferMgmt.h
@@ -59,6 +59,7 @@ typedef union _OVS_BUFFER_CONTEXT {
             UINT32 dataOffsetDelta;
         };
         UINT16 mru;
+        UINT32 skb_hash;
     };


     UINT64 value[MEM_ALIGN_SIZE(32) >> 3];
diff --git a/datapath-windows/ovsext/DpInternal.h 
b/datapath-windows/ovsext/DpInternal.h
old mode 100644
new mode 100755
index 3e351b7..8bb7110
--- a/datapath-windows/ovsext/DpInternal.h
+++ b/datapath-windows/ovsext/DpInternal.h
@@ -300,6 +300,7 @@ typedef struct OvsPacketExecute {
    uint32_t dpNo;
    uint32_t inPort;
    uint16 mru;
+   uint32 skb_hash;
    uint32_t packetLen;
    uint32_t actionsLen;
    PNL_MSG_HDR nlMsgHdr;
diff --git a/datapath-windows/ovsext/User.c b/datapath-windows/ovsext/User.c
old mode 100644
new mode 100755
index 4693a8b..4439379
--- a/datapath-windows/ovsext/User.c
+++ b/datapath-windows/ovsext/User.c
@@ -292,7 +292,8 @@ OvsNlExecuteCmdHandler(POVS_USER_PARAMS_CONTEXT 
usrParamsCtx,
         [OVS_PACKET_ATTR_USERDATA] = {.type = NL_A_UNSPEC, .optional = TRUE},
         [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = {.type = NL_A_UNSPEC,
                                             .optional = TRUE},
-        [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = TRUE }
+        [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = TRUE },
+        [OVS_PACKET_ATTR_SKB_HASH] = { .type = NL_A_U16, .optional = TRUE }
     };


     RtlZeroMemory(&execute, sizeof(OvsPacketExecute));
@@ -394,6 +395,9 @@ _MapNlAttrToOvsPktExec(PNL_MSG_HDR nlMsgHdr, PNL_ATTR 
*nlAttrs,
     if (nlAttrs[OVS_PACKET_ATTR_MRU]) {
         execute->mru = NlAttrGetU16(nlAttrs[OVS_PACKET_ATTR_MRU]);
     }
+    if (nlAttrs[OVS_PACKET_ATTR_SKB_HASH]){
+        execute->skb_hash = NlAttrGetU32(nlAttrs[OVS_PACKET_ATTR_SKB_HASH]);
+    }
 }


 NTSTATUS
@@ -1101,6 +1105,11 @@ OvsCreateQueueNlPacket(PVOID userData,
             goto fail;
         }
     }
+    if (ctx->skb_hash) {
+        if (!NlMsgPutTailU32(&nlBuf, OVS_PACKET_ATTR_SKB_HASH, 
(UINT32)ctx->skb_hash)){
+            goto fail;
+        }
+    }


     /* XXX must send OVS_PACKET_ATTR_EGRESS_TUN_KEY if set by vswtchd */
     if (userData){
diff --git a/datapath/datapath.c b/datapath/datapath.c
old mode 100644
new mode 100755
index b565fc5..9559b4d
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -287,7 +287,6 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct 
sw_flow_key *key)
                stats_counter = &stats->n_missed;
                goto out;
        }
-
        ovs_flow_stats_update(flow, key->tp.flags, skb);
        sf_acts = rcu_dereference(flow->sf_acts);
        ovs_execute_actions(dp, skb, sf_acts, key);
@@ -521,6 +520,14 @@ static int queue_userspace_packet(struct datapath *dp, 
struct sk_buff *skb,
                pad_packet(dp, user_skb);
        }


+    if (skb->hash) {
+        if (nla_put_u32(user_skb, OVS_PACKET_ATTR_SKB_HASH, skb->hash)) {
+            err = -ENOBUFS;
+            goto out;
+        }
+        pad_packet(dp, user_skb);
+    }
+
        /* Add OVS_PACKET_ATTR_LEN when packet is truncated */
        if (cutlen > 0) {
                if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
@@ -571,6 +578,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, 
struct genl_info *info)
        struct datapath *dp;
        struct vport *input_vport;
        u16 mru = 0;
+       u32 hash = 0;
        int len;
        int err;
        bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -596,6 +604,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, 
struct genl_info *info)
        }
        OVS_CB(packet)->mru = mru;


+    if (a[OVS_PACKET_ATTR_SKB_HASH]) {
+        hash = nla_get_u32(a[OVS_PACKET_ATTR_SKB_HASH]);
+               packet->hash = hash;
+               packet->sw_hash = 1;
+    }
+
        /* Build an sw_flow for sending this packet. */
        flow = ovs_flow_alloc();
        err = PTR_ERR(flow);
@@ -657,6 +671,7 @@ static const struct nla_policy 
packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
        [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
        [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
        [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
+       [OVS_PACKET_ATTR_SKB_HASH] = { .type = NLA_U32 },
 };


 static struct genl_ops dp_packet_genl_ops[] = {
diff --git a/datapath/datapath.h b/datapath/datapath.h
old mode 100644
new mode 100755
diff --git a/datapath/linux/compat/include/linux/openvswitch.h 
b/datapath/linux/compat/include/linux/openvswitch.h
index 561f895..3f00e50 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -210,6 +210,7 @@ enum ovs_packet_attr {
                                       error logging should be suppressed. */
        OVS_PACKET_ATTR_MRU,        /* Maximum received IP fragment size. */
        OVS_PACKET_ATTR_LEN,            /* Packet size before truncation. */
+       OVS_PACKET_ATTR_SKB_HASH,   /* Packet size before truncation. */
        __OVS_PACKET_ATTR_MAX
 };


diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
old mode 100644
new mode 100755
index 7cb0f7c..1277656
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1764,6 +1764,9 @@ dpif_netlink_encode_execute(int dp_ifindex, const struct 
dpif_execute *d_exec,
     if (d_exec->mtu) {
         nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mtu);
     }
+    if (d_exec->skb_hash) {
+        nl_msg_put_u32(buf, OVS_PACKET_ATTR_SKB_HASH, d_exec->skb_hash);
+    }
 }


 /* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'.
@@ -2408,7 +2411,8 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct 
ofpbuf *buf,
         [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true },
         [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = 
true },
         [OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true },
-        [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = true }
+        [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = true },
+        [OVS_PACKET_ATTR_SKB_HASH] = { .type = NL_A_U32, .optional = true }
     };


     struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
@@ -2441,6 +2445,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct 
ofpbuf *buf,
     upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY];
     upcall->actions = a[OVS_PACKET_ATTR_ACTIONS];
     upcall->mru = a[OVS_PACKET_ATTR_MRU];
+    upcall->skb_hash = a[OVS_PACKET_ATTR_SKB_HASH];


     /* Allow overwriting the netlink attribute header without reallocating. */
     dp_packet_use_stub(&upcall->packet,
diff --git a/lib/dpif.h b/lib/dpif.h
old mode 100644
new mode 100755
index ab898f4..a9d787c
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -706,6 +706,7 @@ struct dpif_execute {
     bool probe;                     /* Suppress error messages. */
     unsigned int mtu;               /* Maximum transmission unit to fragment.
                                        0 if not a fragmented packet */
+    unsigned int skb_hash;          /*for internal port transmit*/
     const struct flow *flow;         /* Flow extracted from 'packet'. */


     /* Input, but possibly modified as a side effect of execution. */
@@ -795,6 +796,7 @@ struct dpif_upcall {
     size_t key_len;             /* Length of 'key' in bytes. */
     ovs_u128 ufid;              /* Unique flow identifier for 'key'. */
     struct nlattr *mru;         /* Maximum receive unit. */
+    struct nlattr *skb_hash;    /*for internal port transmit packet*/
     struct nlattr *cutlen;      /* Number of bytes shrink from the end. */


     /* DPIF_UC_ACTION only. */
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
old mode 100644
new mode 100755
index e28d3bf..4b3ffdd
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -209,6 +209,7 @@ struct upcall {
     ofp_port_t in_port;            /* OpenFlow in port, or OFPP_NONE. */
     uint16_t mru;                  /* If !0, Maximum receive unit of
                                       fragmented IP packet */
+    uint32_t skb_hash;


     enum dpif_upcall_type type;    /* Datapath type of the upcall. */
     const struct nlattr *userdata; /* Userdata for DPIF_UC_ACTION Upcalls. */
@@ -772,6 +773,7 @@ recv_upcalls(struct handler *handler)
         struct upcall *upcall = &upcalls[n_upcalls];
         struct flow *flow = &flows[n_upcalls];
         unsigned int mru;
+        unsigned int skb_hash;
         int error;


         ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
@@ -792,6 +794,12 @@ recv_upcalls(struct handler *handler)
             mru = 0;
         }


+        if (dupcall->skb_hash){
+            skb_hash = nl_attr_get_u32(dupcall->skb_hash);
+        } else {
+            skb_hash = 0;
+        }
+
         error = upcall_receive(upcall, udpif->backer, &dupcall->packet,
                                dupcall->type, dupcall->userdata, flow, mru,
                                &dupcall->ufid, PMD_ID_NULL);
@@ -816,7 +824,7 @@ recv_upcalls(struct handler *handler)


         upcall->out_tun_key = dupcall->out_tun_key;
         upcall->actions = dupcall->actions;
-
+        upcall->skb_hash = skb_hash;
         pkt_metadata_from_flow(&dupcall->packet.md, flow);
         flow_extract(&dupcall->packet, flow);


@@ -1470,6 +1478,7 @@ handle_upcalls(struct udpif *udpif, struct upcall 
*upcalls,
             op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) 
!= 0;
             op->dop.u.execute.probe = false;
             op->dop.u.execute.mtu = upcall->mru;
+            op->dop.u.execute.skb_hash = upcall->skb_hash;
         }
     }


--
2.1.4









At 2019-11-06 12:04:57, "Tonghao Zhang" <xiangxia.m....@gmail.com> wrote:
>On Mon, Nov 4, 2019 at 7:44 PM ychen <ychen103...@163.com> wrote:
>>
>>
>>
>> we can easily reproduce this phenomenon by using tcp socket stream sending 
>> from ovs internal port.
>>
>>
>>
>>
>> At 2019-10-30 19:49:16, "ychen" <ychen103...@163.com> wrote:
>>
>> Hi,
>>    when we use docker to establish tcp session, we found that the packet 
>> which must do upcall to userspace has different encapsulated udp source port
>>    with packet that only needs do datapath flow forwarding.
>>
>>
>>    After some code research and kprobe debug,  we found the following:
>>    1.  use udp_flow_src_port() to get the port
>>         so when both skb->l4_hash==0 and skb->sw_hash==0, 5 tuple data will 
>> be used to calculate the skb->hash
>>     2. when first packet of tcp session coming,  packet needs do upcall to 
>> userspace, and then ovs_packet_cmd_execute() called
>>         new skb is allocated with both l4_hash and sw_hash set to 0
>>     3. when none first packet of tcp sesion coming, function 
>> ovs_dp_process_packet()->ovs_execute_actions() called,
>>         and this time original skb is reserved.
>>         when packet has do ip_forward(), kprobe debug prints skb->l4_hash=1, 
>> sw_hash=0
>>     4. we searched kernel code, and found such code:
>>          skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
>>         {  if (sk->sk_txhash) {
>> skb->l4_hash = 1;
>> skb->hash = sk->sk_txhash;
>>     }
>>        }
>>       static inline void sk_set_txhash(struct sock *sk)
>>       {sk->sk_txhash = net_tx_rndhash();  ==============>it is a random 
>> value!!}
>>    5. so let's have a summary:
>>        when packet is processing only in datapath flow, skb->hash is random 
>> value for the same tcp session?
>>        when packet needs processing first to userspace, than kernel space, 
>> skb->hash is calculated by 5 tuple?
>>
>>    Our testing enviroment:
>>    debian 9, kernel 4.9.65
>>    ovs version: 2.8.2
>>
>>
>>    Simple topo is like this:
>>    docker_eth0<-------+
>>                                   | veth                 ip_forward
>>                                  
>> +host_veth0<----------------->port-eth(ovs-ineternal)
>>     host_veth0 and port-eth device stay in physical host.
>>
>>
>>    So can we treat skb->hash as a attribute, when send packet to userspace, 
>> encode this attribute;
>>    and then do ovs_packet_cmd_execute(), retrieve the same hash value from 
>> userspace?
>>
>>
>>   another important tips:
>>  if we send packets from qemu based tap device, vxlan source port is always 
>> same for the same tcp session;
>>  only when send packets from docker in which packets will do ip_forward, 
>> vxlan source port may different for same tcp session.
>Should be fixed. The patch will be sent.
>>
>>
>>
>>
>>
>>
>> _______________________________________________
>> dev mailing list
>> d...@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to