I have tested this patch on Linux with OVS version 2.8.2, and it appears to work.
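For context on why the hash matters here: udp_flow_src_port() in the kernel
scales skb->hash into the local port range to pick the outer UDP source port.
Below is a small standalone sketch of that scaling arithmetic (the formula
mirrors the kernel's; the function name flow_src_port, the port range, and
the two hash values are made up purely for illustration):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the scaling used by the kernel's udp_flow_src_port():
     * map a 32-bit flow hash into the source-port range [min, max). */
    static uint16_t flow_src_port(uint32_t hash, int min, int max)
    {
        return (uint16_t)((((uint64_t)hash * (max - min)) >> 32) + min);
    }

    int main(void)
    {
        /* Hypothetical hashes: the one the socket originally carried, and
         * one recomputed from headers after the upcall left skb->hash at 0. */
        uint32_t original_hash   = 0x9e3779b9;
        uint32_t recomputed_hash = 0x12345678;

        printf("outer sport (original hash):   %u\n",
               flow_src_port(original_hash, 32768, 61000));
        printf("outer sport (recomputed hash): %u\n",
               flow_src_port(recomputed_hash, 32768, 61000));
        /* Different hashes give different outer UDP source ports for the
         * same inner TCP session, which breaks consistent load balancing. */
        return 0;
    }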
From fa24d308d40f37f890fead0b79ac1f0f7baa28ba Mon Sep 17 00:00:00 2001
From: hzchenyuefang <hzchenyuef...@corp.netease.com>
Date: Sat, 9 Nov 2019 10:14:23 +0800
Subject: [PATCH 1/1] Fix skb->hash loss when sending from an internal port
The first packet of a flow must go to userspace, and when it is re-injected
into the datapath, a freshly allocated skb is used whose skb->hash is 0, so
the hash is recalculated the next time it is needed. This matters in the
following case: a packet is sent from an internal port and then
VXLAN-encapsulated; the outer UDP header's source port is derived from
skb->hash (see udp_flow_src_port()), so packets of the same TCP session can
end up with different outer UDP source ports, which hurts load balancing.
Fix this by carrying skb->hash to userspace in the upcall
(OVS_PACKET_ATTR_SKB_HASH) and restoring it in ovs_packet_cmd_execute().
---
 datapath-windows/ovsext/BufferMgmt.h              |  1 +
 datapath-windows/ovsext/DpInternal.h              |  1 +
 datapath-windows/ovsext/User.c                    | 11 ++++++++++-
 datapath/datapath.c                               | 17 ++++++++++++++++-
 datapath/datapath.h                               |  0
 datapath/linux/compat/include/linux/openvswitch.h |  1 +
 lib/dpif-netlink.c                                |  7 ++++++-
 lib/dpif.h                                        |  2 ++
 ofproto/ofproto-dpif-upcall.c                     | 11 ++++++++++-
 9 files changed, 47 insertions(+), 4 deletions(-)
mode change 100644 => 100755 datapath-windows/ovsext/BufferMgmt.h
mode change 100644 => 100755 datapath-windows/ovsext/DpInternal.h
mode change 100644 => 100755 datapath-windows/ovsext/User.c
mode change 100644 => 100755 datapath/datapath.c
mode change 100644 => 100755 datapath/datapath.h
mode change 100644 => 100755 lib/dpif-netlink.c
mode change 100644 => 100755 lib/dpif.h
mode change 100644 => 100755 ofproto/ofproto-dpif-upcall.c
diff --git a/datapath-windows/ovsext/BufferMgmt.h b/datapath-windows/ovsext/BufferMgmt.h
old mode 100644
new mode 100755
index dcf310a..34f50a1
--- a/datapath-windows/ovsext/BufferMgmt.h
+++ b/datapath-windows/ovsext/BufferMgmt.h
@@ -59,6 +59,7 @@ typedef union _OVS_BUFFER_CONTEXT {
UINT32 dataOffsetDelta;
};
UINT16 mru;
+ UINT32 skb_hash;
};
UINT64 value[MEM_ALIGN_SIZE(32) >> 3];
diff --git a/datapath-windows/ovsext/DpInternal.h b/datapath-windows/ovsext/DpInternal.h
old mode 100644
new mode 100755
index 3e351b7..8bb7110
--- a/datapath-windows/ovsext/DpInternal.h
+++ b/datapath-windows/ovsext/DpInternal.h
@@ -300,6 +300,7 @@ typedef struct OvsPacketExecute {
uint32_t dpNo;
uint32_t inPort;
uint16 mru;
+ uint32 skb_hash;
uint32_t packetLen;
uint32_t actionsLen;
PNL_MSG_HDR nlMsgHdr;
diff --git a/datapath-windows/ovsext/User.c b/datapath-windows/ovsext/User.c
old mode 100644
new mode 100755
index 4693a8b..4439379
--- a/datapath-windows/ovsext/User.c
+++ b/datapath-windows/ovsext/User.c
@@ -292,7 +292,8 @@ OvsNlExecuteCmdHandler(POVS_USER_PARAMS_CONTEXT usrParamsCtx,
[OVS_PACKET_ATTR_USERDATA] = {.type = NL_A_UNSPEC, .optional = TRUE},
[OVS_PACKET_ATTR_EGRESS_TUN_KEY] = {.type = NL_A_UNSPEC,
.optional = TRUE},
- [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = TRUE }
+ [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = TRUE },
+ [OVS_PACKET_ATTR_SKB_HASH] = { .type = NL_A_U32, .optional = TRUE }
};
RtlZeroMemory(&execute, sizeof(OvsPacketExecute));
@@ -394,6 +395,9 @@ _MapNlAttrToOvsPktExec(PNL_MSG_HDR nlMsgHdr, PNL_ATTR *nlAttrs,
if (nlAttrs[OVS_PACKET_ATTR_MRU]) {
execute->mru = NlAttrGetU16(nlAttrs[OVS_PACKET_ATTR_MRU]);
}
+ if (nlAttrs[OVS_PACKET_ATTR_SKB_HASH]) {
+ execute->skb_hash = NlAttrGetU32(nlAttrs[OVS_PACKET_ATTR_SKB_HASH]);
+ }
}
NTSTATUS
@@ -1101,6 +1105,11 @@ OvsCreateQueueNlPacket(PVOID userData,
goto fail;
}
}
+ if (ctx->skb_hash) {
+ if (!NlMsgPutTailU32(&nlBuf, OVS_PACKET_ATTR_SKB_HASH,
+                      (UINT32)ctx->skb_hash)) {
+ goto fail;
+ }
+ }
/* XXX must send OVS_PACKET_ATTR_EGRESS_TUN_KEY if set by vswtchd */
if (userData){
diff --git a/datapath/datapath.c b/datapath/datapath.c
old mode 100644
new mode 100755
index b565fc5..9559b4d
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -287,7 +287,6 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
stats_counter = &stats->n_missed;
goto out;
}
-
ovs_flow_stats_update(flow, key->tp.flags, skb);
sf_acts = rcu_dereference(flow->sf_acts);
ovs_execute_actions(dp, skb, sf_acts, key);
@@ -521,6 +520,14 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
pad_packet(dp, user_skb);
}
+ if (skb->hash) {
+ if (nla_put_u32(user_skb, OVS_PACKET_ATTR_SKB_HASH, skb->hash)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+ pad_packet(dp, user_skb);
+ }
+
/* Add OVS_PACKET_ATTR_LEN when packet is truncated */
if (cutlen > 0) {
if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN,
@@ -571,6 +578,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
struct vport *input_vport;
u16 mru = 0;
+ u32 hash = 0;
int len;
int err;
bool log = !a[OVS_PACKET_ATTR_PROBE];
@@ -596,6 +604,12 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
}
OVS_CB(packet)->mru = mru;
+ if (a[OVS_PACKET_ATTR_SKB_HASH]) {
+ hash = nla_get_u32(a[OVS_PACKET_ATTR_SKB_HASH]);
+ packet->hash = hash;
+ packet->sw_hash = 1;
+ }
+
/* Build an sw_flow for sending this packet. */
flow = ovs_flow_alloc();
err = PTR_ERR(flow);
@@ -657,6 +671,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
[OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED },
[OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG },
[OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 },
+ [OVS_PACKET_ATTR_SKB_HASH] = { .type = NLA_U32 },
};
static struct genl_ops dp_packet_genl_ops[] = {
diff --git a/datapath/datapath.h b/datapath/datapath.h
old mode 100644
new mode 100755
diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
index 561f895..3f00e50 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -210,6 +210,7 @@ enum ovs_packet_attr {
error logging should be suppressed. */
OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */
OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */
+ OVS_PACKET_ATTR_SKB_HASH, /* Packet hash, preserved across upcalls. */
__OVS_PACKET_ATTR_MAX
};
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
old mode 100644
new mode 100755
index 7cb0f7c..1277656
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -1764,6 +1764,9 @@ dpif_netlink_encode_execute(int dp_ifindex, const struct dpif_execute *d_exec,
if (d_exec->mtu) {
nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mtu);
}
+ if (d_exec->skb_hash) {
+ nl_msg_put_u32(buf, OVS_PACKET_ATTR_SKB_HASH, d_exec->skb_hash);
+ }
}
/* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'.
@@ -2408,7 +2411,8 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf,
[OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true },
[OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = true },
[OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true },
- [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = true }
+ [OVS_PACKET_ATTR_MRU] = { .type = NL_A_U16, .optional = true },
+ [OVS_PACKET_ATTR_SKB_HASH] = { .type = NL_A_U32, .optional = true }
};
struct ofpbuf b = ofpbuf_const_initializer(buf->data, buf->size);
@@ -2441,6 +2445,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf,
upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY];
upcall->actions = a[OVS_PACKET_ATTR_ACTIONS];
upcall->mru = a[OVS_PACKET_ATTR_MRU];
+ upcall->skb_hash = a[OVS_PACKET_ATTR_SKB_HASH];
/* Allow overwriting the netlink attribute header without reallocating. */
dp_packet_use_stub(&upcall->packet,
diff --git a/lib/dpif.h b/lib/dpif.h
old mode 100644
new mode 100755
index ab898f4..a9d787c
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -706,6 +706,7 @@ struct dpif_execute {
bool probe; /* Suppress error messages. */
unsigned int mtu; /* Maximum transmission unit to fragment.
0 if not a fragmented packet */
+ unsigned int skb_hash; /* Packet hash to restore, for packets sent from an internal port; 0 if not set. */
const struct flow *flow; /* Flow extracted from 'packet'. */
/* Input, but possibly modified as a side effect of execution. */
@@ -795,6 +796,7 @@ struct dpif_upcall {
size_t key_len; /* Length of 'key' in bytes. */
ovs_u128 ufid; /* Unique flow identifier for 'key'. */
struct nlattr *mru; /* Maximum receive unit. */
+ struct nlattr *skb_hash; /* Hash of the upcalled packet, for packets sent from an internal port. */
struct nlattr *cutlen; /* Number of bytes shrink from the end. */
/* DPIF_UC_ACTION only. */
diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
old mode 100644
new mode 100755
index e28d3bf..4b3ffdd
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -209,6 +209,7 @@ struct upcall {
ofp_port_t in_port; /* OpenFlow in port, or OFPP_NONE. */
uint16_t mru; /* If !0, Maximum receive unit of
fragmented IP packet */
+ uint32_t skb_hash; /* If !0, packet hash carried from the datapath. */
enum dpif_upcall_type type; /* Datapath type of the upcall. */
const struct nlattr *userdata; /* Userdata for DPIF_UC_ACTION Upcalls. */
@@ -772,6 +773,7 @@ recv_upcalls(struct handler *handler)
struct upcall *upcall = &upcalls[n_upcalls];
struct flow *flow = &flows[n_upcalls];
unsigned int mru;
+ unsigned int skb_hash;
int error;
ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls],
@@ -792,6 +794,12 @@ recv_upcalls(struct handler *handler)
mru = 0;
}
+ if (dupcall->skb_hash) {
+ skb_hash = nl_attr_get_u32(dupcall->skb_hash);
+ } else {
+ skb_hash = 0;
+ }
+
error = upcall_receive(upcall, udpif->backer, &dupcall->packet,
dupcall->type, dupcall->userdata, flow, mru,
&dupcall->ufid, PMD_ID_NULL);
@@ -816,7 +824,7 @@ recv_upcalls(struct handler *handler)
upcall->out_tun_key = dupcall->out_tun_key;
upcall->actions = dupcall->actions;
-
+ upcall->skb_hash = skb_hash;
pkt_metadata_from_flow(&dupcall->packet.md, flow);
flow_extract(&dupcall->packet, flow);
@@ -1470,6 +1478,7 @@ handle_upcalls(struct udpif *udpif, struct upcall *upcalls,
op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0;
op->dop.u.execute.probe = false;
op->dop.u.execute.mtu = upcall->mru;
+ op->dop.u.execute.skb_hash = upcall->skb_hash;
}
}
--
2.1.4
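For anyone wanting to verify this: assuming VXLAN runs on its default UDP
port 4789, one way to observe the symptom is to capture the underlay traffic
and compare outer source ports for a single inner TCP connection, e.g. with
tcpdump -nni <underlay-nic> 'udp dst port 4789'. Without the patch, the
upcalled (first) packet of a session can carry a different outer source port
than the later datapath-forwarded packets; with the patch applied, the outer
source port should stay stable.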
At 2019-11-06 12:04:57, "Tonghao Zhang" <xiangxia.m....@gmail.com> wrote:
>On Mon, Nov 4, 2019 at 7:44 PM ychen <ychen103...@163.com> wrote:
>>
>>
>>
>> We can easily reproduce this phenomenon by sending a TCP socket stream
>> from an OVS internal port.
>>
>>
>>
>>
>> At 2019-10-30 19:49:16, "ychen" <ychen103...@163.com> wrote:
>>
>> Hi,
>> when we use docker to establish a tcp session, we found that a packet
>> that must be upcalled to userspace gets a different encapsulated udp
>> source port than packets that are forwarded by the datapath flow alone.
>>
>>
>> After some code research and kprobe debugging, we found the following:
>> 1. udp_flow_src_port() is used to get the port, so when both
>> skb->l4_hash==0 and skb->sw_hash==0, the 5-tuple is used to
>> calculate skb->hash.
>> 2. when the first packet of a tcp session arrives, it is upcalled to
>> userspace, and then ovs_packet_cmd_execute() is called;
>> a new skb is allocated with both l4_hash and sw_hash set to 0.
>> 3. when a non-first packet of the tcp session arrives,
>> ovs_dp_process_packet()->ovs_execute_actions() is called, and this
>> time the original skb is preserved.
>> After the packet has gone through ip_forward(), kprobe debugging
>> shows skb->l4_hash=1, sw_hash=0.
>> 4. we searched the kernel code and found the following:
>>     static inline void skb_set_hash_from_sk(struct sk_buff *skb,
>>                                             struct sock *sk)
>>     {
>>         if (sk->sk_txhash) {
>>             skb->l4_hash = 1;
>>             skb->hash = sk->sk_txhash;
>>         }
>>     }
>>
>>     static inline void sk_set_txhash(struct sock *sk)
>>     {
>>         sk->sk_txhash = net_tx_rndhash();  /* ==> it is a random value!! */
>>     }
>> 5. so let's have a summary:
>> when a packet is processed only by the datapath flow, skb->hash is a
>> random value for the same tcp session;
>> when a packet is processed first in userspace and then in kernel
>> space, skb->hash is calculated from the 5-tuple.
>>
>> Our testing environment:
>> debian 9, kernel 4.9.65
>> ovs version: 2.8.2
>>
>>
>> A simple topo is like this:
>>
>>     docker_eth0 <---+
>>                     | veth             ip_forward
>>                     +---> host_veth0 <------------> port-eth (ovs-internal)
>>
>> host_veth0 and the port-eth device are on the physical host.
>>
>>
>> So can we treat skb->hash as an attribute: when sending the packet to
>> userspace, encode this attribute, and then in ovs_packet_cmd_execute()
>> restore the same hash value from userspace?
>>
>>
>> Another important tip:
>> if we send packets from a qemu-based tap device, the vxlan source port
>> is always the same for a given tcp session;
>> only when packets are sent from docker, where they go through
>> ip_forward, can the vxlan source port differ for the same tcp session.
>Should be fixed. The patch will be sent.
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev