Re: [ovs-dev] [PATCH v3] route-table: Filter route changes by interface.

2024-03-26 Thread Cheng Li
On Tue, Mar 26, 2024 at 06:42:02PM +0100, 【外部账号】Ilya Maximets wrote:
> On 3/24/24 13:16, Cheng Li wrote:
> > When ovs host is also a kubernets node, pod creation/deletion may
> > trigger route changes. As a result, ovs run route_table_reset().
> > As ovs do not care the kubernetes pod routes, route_table_reset()
> > is not neccessary.
> > 
> > Signed-off-by: Cheng Li 
> > ---
> 
> Hi, Cheng Li.  Thanks for the patch!
> 
> I'm a little confused by the use case though.  Could you explain
> a bit more why this is a problem (route dump is relatively fast,
> unless there are millions of routes) and how this change helps?
> 
> AFAIU, routes will not change much in kubernetes environment once
> the pod is initially created and configured and the port creation
> will trigger route cache reset anyway.
> 

Hi Ilya, thanks for reviewing this patch.

When Calico is used as the Kubernetes CNI and IPinIP overlay mode[1] is
enabled, each time a pod is created/deleted, a route (dev tunl0) is
advertised across all cluster nodes.
```
# ip monitor route
10.233.75.61 via 11.46.8.90 dev tunl0 proto bird onlink
```

If we have a large cluster, route update may happen in high
frequency, which triggers lots of route_table_reset().

In route_table_reset(), all ovs route items are first deleted and
then the latest kernel route items are added. There is a time gap
between all the old route items being deleted and the new route items
becoming ready. During this gap, upcall/revalidation generates bad
datapath flows.  As ovs does not care about Kubernetes route changes,
adding a filter seems to be the simplest way to resolve this issue.

[1] https://docs.tigera.io/calico/latest/networking/configuring/vxlan-ipip

> And we will need to reset the cache when new interfaces are
> added/removed from the filter, because otherwise we'll have
> stale routes in there and the cache may become inconsistent.
> 

Make sense, will fix in next version.

> Best regards, Ilya Maximets
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3] route-table: Filter route changes by interface.

2024-03-24 Thread Cheng Li
When the ovs host is also a kubernetes node, pod creation/deletion may
trigger route changes. As a result, ovs runs route_table_reset().
As ovs does not care about the kubernetes pod routes, route_table_reset()
is not necessary.

Signed-off-by: Cheng Li 
---

Notes:
v2: Add function definition for bsd and stub.
v3: Fix unused-parameter error.

 lib/route-table-bsd.c  |  5 +
 lib/route-table-stub.c |  5 +
 lib/route-table.c  | 41 ++---
 lib/route-table.h  |  1 +
 tests/system-route.at  | 51 ++
 vswitchd/bridge.c  |  3 +++
 vswitchd/vswitch.xml   | 10 +
 7 files changed, 113 insertions(+), 3 deletions(-)

diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
index 34d42cfab..4762e2194 100644
--- a/lib/route-table-bsd.c
+++ b/lib/route-table-bsd.c
@@ -205,3 +205,8 @@ void
 route_table_wait(void)
 {
 }
+
+void
+disable_notify_on_interfaces(const char *ifaces OVS_UNUSED)
+{
+}
diff --git a/lib/route-table-stub.c b/lib/route-table-stub.c
index dd0b096d4..cc13c5191 100644
--- a/lib/route-table-stub.c
+++ b/lib/route-table-stub.c
@@ -48,3 +48,8 @@ void
 route_table_wait(void)
 {
 }
+
+void
+disable_notify_on_interfaces(const char *ifaces OVS_UNUSED)
+{
+}
diff --git a/lib/route-table.c b/lib/route-table.c
index f1fe32714..eeff509f0 100644
--- a/lib/route-table.c
+++ b/lib/route-table.c
@@ -33,6 +33,7 @@
 #include "netlink-notifier.h"
 #include "netlink-socket.h"
 #include "openvswitch/ofpbuf.h"
+#include "lib/sset.h"
 #include "ovs-router.h"
 #include "packets.h"
 #include "rtnetlink.h"
@@ -82,6 +83,7 @@ static struct nln_notifier *route6_notifier = NULL;
 static struct nln_notifier *name_notifier = NULL;
 
 static bool route_table_valid = false;
+static struct sset disabled_ifaces = SSET_INITIALIZER(_ifaces);
 
 static void route_table_reset(void);
 static void route_table_handle_msg(const struct route_table_msg *);
@@ -92,6 +94,7 @@ static void route_map_clear(void);
 static void name_table_init(void);
 static void name_table_change(const struct rtnetlink_change *, void *);
 
+
 uint64_t
 route_table_get_change_seq(void)
 {
@@ -354,13 +357,45 @@ route_table_parse(struct ofpbuf *buf, struct 
route_table_msg *change)
 return ipv4 ? RTNLGRP_IPV4_ROUTE : RTNLGRP_IPV6_ROUTE;
 }
 
+void
+disable_notify_on_interfaces(const char *ifaces)
+{
+struct sset tmp_ifaces;
+
+if (ifaces) {
+sset_from_delimited_string(_ifaces, ifaces, ", ");
+} else {
+sset_init(_ifaces);
+}
+if (! sset_equals(_ifaces, _ifaces)) {
+const char *iface;
+struct ds ds = DS_EMPTY_INITIALIZER;
+
+sset_swap(_ifaces, _ifaces);
+SSET_FOR_EACH (iface, _ifaces) {
+ds_put_format(, " %s", iface);
+}
+VLOG_DBG_RL(, "route notify disabled interfaces: [%s]",
+ds_cstr());
+ds_destroy();
+}
+sset_destroy(_ifaces);
+}
+
 static void
-route_table_change(const struct route_table_msg *change OVS_UNUSED,
+route_table_change(const struct route_table_msg *change,
void *aux OVS_UNUSED)
 {
-if (!change || change->relevant) {
-route_table_valid = false;
+if (change) {
+if (!change->relevant) {
+return;
+}
+if (change->rd.ifname[0] != '\0' &&
+sset_contains(_ifaces, change->rd.ifname)) {
+return;
+}
 }
+route_table_valid = false;
 }
 
 static void
diff --git a/lib/route-table.h b/lib/route-table.h
index 3a02d737a..716e5bae0 100644
--- a/lib/route-table.h
+++ b/lib/route-table.h
@@ -33,4 +33,5 @@ void route_table_wait(void);
 bool route_table_fallback_lookup(const struct in6_addr *ip6_dst,
  char name[],
  struct in6_addr *gw6);
+void disable_notify_on_interfaces(const char *ifaces);
 #endif /* route-table.h */
diff --git a/tests/system-route.at b/tests/system-route.at
index c0ecad6cf..039255df7 100644
--- a/tests/system-route.at
+++ b/tests/system-route.at
@@ -128,3 +128,54 @@ OVS_WAIT_UNTIL([test $(ovs-appctl ovs/route/show | grep -c 
'p1-route') -eq 0 ])
 
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
+
+
+dnl Checks that disabled interface doesn't trigger route table refresh.
+AT_SETUP([ovs-route - filter by interface])
+AT_KEYWORDS([route])
+OVS_TRAFFIC_VSWITCHD_START()
+
+dnl Create tap port.
+on_exit 'ip link del p1-route; ip link del p2-route'
+AT_CHECK([ip tuntap add name p1-route mode tap])
+AT_CHECK([ip tuntap add name p2-route mode tap])
+AT_CHECK([ip link set p1-route up])
+AT_CHECK([ip link set p2-route up])
+
+dnl Add ip address.
+AT_CHECK([ip addr add 10.0.0.17/24 dev p1-route], [0], [stdout])
+AT_CHECK([ip addr add 10.0.1.17/24 dev p2-route], [0], [stdout])
+
+dnl Check that OVS catches route updates.
+OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/s

[ovs-dev] [PATCH v2] route-table: Filter route changes by interface.

2024-03-24 Thread Cheng Li
When the ovs host is also a kubernetes node, pod creation/deletion may
trigger route changes. As a result, ovs runs route_table_reset().
As ovs does not care about the kubernetes pod routes, route_table_reset()
is not necessary.

Signed-off-by: Cheng Li 
---

Notes:
v2: Add function definition for bsd and stub.

 lib/route-table-bsd.c  |  5 +
 lib/route-table-stub.c |  5 +
 lib/route-table.c  | 39 ++--
 lib/route-table.h  |  1 +
 tests/system-route.at  | 51 ++
 vswitchd/bridge.c  |  3 +++
 vswitchd/vswitch.xml   | 10 +
 7 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
index 34d42cfab..0967ad224 100644
--- a/lib/route-table-bsd.c
+++ b/lib/route-table-bsd.c
@@ -205,3 +205,8 @@ void
 route_table_wait(void)
 {
 }
+
+void
+disable_notify_on_interfaces(const char *ifaces)
+{
+}
diff --git a/lib/route-table-stub.c b/lib/route-table-stub.c
index dd0b096d4..843ff97f9 100644
--- a/lib/route-table-stub.c
+++ b/lib/route-table-stub.c
@@ -48,3 +48,8 @@ void
 route_table_wait(void)
 {
 }
+
+void
+disable_notify_on_interfaces(const char *ifaces)
+{
+}
diff --git a/lib/route-table.c b/lib/route-table.c
index f1fe32714..ed234fd87 100644
--- a/lib/route-table.c
+++ b/lib/route-table.c
@@ -33,6 +33,7 @@
 #include "netlink-notifier.h"
 #include "netlink-socket.h"
 #include "openvswitch/ofpbuf.h"
+#include "lib/sset.h"
 #include "ovs-router.h"
 #include "packets.h"
 #include "rtnetlink.h"
@@ -82,6 +83,7 @@ static struct nln_notifier *route6_notifier = NULL;
 static struct nln_notifier *name_notifier = NULL;
 
 static bool route_table_valid = false;
+static struct sset disabled_ifaces = SSET_INITIALIZER(_ifaces);
 
 static void route_table_reset(void);
 static void route_table_handle_msg(const struct route_table_msg *);
@@ -92,6 +94,7 @@ static void route_map_clear(void);
 static void name_table_init(void);
 static void name_table_change(const struct rtnetlink_change *, void *);
 
+
 uint64_t
 route_table_get_change_seq(void)
 {
@@ -354,13 +357,45 @@ route_table_parse(struct ofpbuf *buf, struct 
route_table_msg *change)
 return ipv4 ? RTNLGRP_IPV4_ROUTE : RTNLGRP_IPV6_ROUTE;
 }
 
+void
+disable_notify_on_interfaces(const char *ifaces)
+{
+struct sset tmp_ifaces;
+
+if (ifaces) {
+sset_from_delimited_string(_ifaces, ifaces, ", ");
+} else {
+sset_init(_ifaces);
+}
+if (! sset_equals(_ifaces, _ifaces)) {
+const char *iface;
+struct ds ds = DS_EMPTY_INITIALIZER;
+
+sset_swap(_ifaces, _ifaces);
+SSET_FOR_EACH (iface, _ifaces) {
+ds_put_format(, " %s", iface);
+}
+VLOG_DBG_RL(, "route notify disabled interfaces: [%s]",
+ds_cstr());
+ds_destroy();
+}
+sset_destroy(_ifaces);
+}
+
 static void
 route_table_change(const struct route_table_msg *change OVS_UNUSED,
void *aux OVS_UNUSED)
 {
-if (!change || change->relevant) {
-route_table_valid = false;
+if (change) {
+if (!change->relevant) {
+return;
+}
+if (change->rd.ifname[0] != '\0' &&
+sset_contains(_ifaces, change->rd.ifname)) {
+return;
+}
 }
+route_table_valid = false;
 }
 
 static void
diff --git a/lib/route-table.h b/lib/route-table.h
index 3a02d737a..716e5bae0 100644
--- a/lib/route-table.h
+++ b/lib/route-table.h
@@ -33,4 +33,5 @@ void route_table_wait(void);
 bool route_table_fallback_lookup(const struct in6_addr *ip6_dst,
  char name[],
  struct in6_addr *gw6);
+void disable_notify_on_interfaces(const char *ifaces);
 #endif /* route-table.h */
diff --git a/tests/system-route.at b/tests/system-route.at
index c0ecad6cf..039255df7 100644
--- a/tests/system-route.at
+++ b/tests/system-route.at
@@ -128,3 +128,54 @@ OVS_WAIT_UNTIL([test $(ovs-appctl ovs/route/show | grep -c 
'p1-route') -eq 0 ])
 
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
+
+
+dnl Checks that disabled interface doesn't trigger route table refresh.
+AT_SETUP([ovs-route - filter by interface])
+AT_KEYWORDS([route])
+OVS_TRAFFIC_VSWITCHD_START()
+
+dnl Create tap port.
+on_exit 'ip link del p1-route; ip link del p2-route'
+AT_CHECK([ip tuntap add name p1-route mode tap])
+AT_CHECK([ip tuntap add name p2-route mode tap])
+AT_CHECK([ip link set p1-route up])
+AT_CHECK([ip link set p2-route up])
+
+dnl Add ip address.
+AT_CHECK([ip addr add 10.0.0.17/24 dev p1-route], [0], [stdout])
+AT_CHECK([ip addr add 10.0.1.17/24 dev p2-route], [0], [stdout])
+
+dnl Check that OVS catches route updates.
+OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep -P 'p(1|2)-route' | 
sort], [dnl
+Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17
+Cached: 10.0.0.17/32

[ovs-dev] [PATCH] route-table: Filter route changes by interface.

2024-03-23 Thread Cheng Li
When the ovs host is also a kubernetes node, pod creation/deletion may
trigger route changes. As a result, ovs runs route_table_reset().
As ovs does not care about the kubernetes pod routes, route_table_reset()
is not necessary.

Signed-off-by: Cheng Li 
---
 lib/route-table.c | 39 +++--
 lib/route-table.h |  1 +
 tests/system-route.at | 51 +++
 vswitchd/bridge.c |  3 +++
 vswitchd/vswitch.xml  | 10 +
 5 files changed, 102 insertions(+), 2 deletions(-)

diff --git a/lib/route-table.c b/lib/route-table.c
index f1fe32714..ec8783923 100644
--- a/lib/route-table.c
+++ b/lib/route-table.c
@@ -33,6 +33,7 @@
 #include "netlink-notifier.h"
 #include "netlink-socket.h"
 #include "openvswitch/ofpbuf.h"
+#include "lib/sset.h"
 #include "ovs-router.h"
 #include "packets.h"
 #include "rtnetlink.h"
@@ -82,6 +83,7 @@ static struct nln_notifier *route6_notifier = NULL;
 static struct nln_notifier *name_notifier = NULL;
 
 static bool route_table_valid = false;
+static struct sset disabled_ifaces = SSET_INITIALIZER(_ifaces);
 
 static void route_table_reset(void);
 static void route_table_handle_msg(const struct route_table_msg *);
@@ -92,6 +94,32 @@ static void route_map_clear(void);
 static void name_table_init(void);
 static void name_table_change(const struct rtnetlink_change *, void *);
 
+void
+disable_notify_on_interfaces(const char *ifaces)
+{
+struct sset tmp_ifaces;
+
+if (ifaces) {
+sset_from_delimited_string(_ifaces, ifaces, ", ");
+} else {
+sset_init(_ifaces);
+}
+if (! sset_equals(_ifaces, _ifaces)) {
+const char *iface;
+struct ds ds = DS_EMPTY_INITIALIZER;
+
+sset_swap(_ifaces, _ifaces);
+SSET_FOR_EACH (iface, _ifaces) {
+ds_put_format(, " %s", iface);
+}
+VLOG_DBG_RL(, "route notify disabled interfaces: [%s]",
+ds_cstr());
+ds_destroy();
+}
+sset_destroy(_ifaces);
+
+}
+
 uint64_t
 route_table_get_change_seq(void)
 {
@@ -358,9 +386,16 @@ static void
 route_table_change(const struct route_table_msg *change OVS_UNUSED,
void *aux OVS_UNUSED)
 {
-if (!change || change->relevant) {
-route_table_valid = false;
+if (change) {
+if (!change->relevant) {
+return;
+}
+if (change->rd.ifname[0] != '\0' &&
+sset_contains(_ifaces, change->rd.ifname)) {
+return;
+}
 }
+route_table_valid = false;
 }
 
 static void
diff --git a/lib/route-table.h b/lib/route-table.h
index 3a02d737a..716e5bae0 100644
--- a/lib/route-table.h
+++ b/lib/route-table.h
@@ -33,4 +33,5 @@ void route_table_wait(void);
 bool route_table_fallback_lookup(const struct in6_addr *ip6_dst,
  char name[],
  struct in6_addr *gw6);
+void disable_notify_on_interfaces(const char *ifaces);
 #endif /* route-table.h */
diff --git a/tests/system-route.at b/tests/system-route.at
index c0ecad6cf..039255df7 100644
--- a/tests/system-route.at
+++ b/tests/system-route.at
@@ -128,3 +128,54 @@ OVS_WAIT_UNTIL([test $(ovs-appctl ovs/route/show | grep -c 
'p1-route') -eq 0 ])
 
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
+
+
+dnl Checks that disabled interface doesn't trigger route table refresh.
+AT_SETUP([ovs-route - filter by interface])
+AT_KEYWORDS([route])
+OVS_TRAFFIC_VSWITCHD_START()
+
+dnl Create tap port.
+on_exit 'ip link del p1-route; ip link del p2-route'
+AT_CHECK([ip tuntap add name p1-route mode tap])
+AT_CHECK([ip tuntap add name p2-route mode tap])
+AT_CHECK([ip link set p1-route up])
+AT_CHECK([ip link set p2-route up])
+
+dnl Add ip address.
+AT_CHECK([ip addr add 10.0.0.17/24 dev p1-route], [0], [stdout])
+AT_CHECK([ip addr add 10.0.1.17/24 dev p2-route], [0], [stdout])
+
+dnl Check that OVS catches route updates.
+OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep -P 'p(1|2)-route' | 
sort], [dnl
+Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17
+Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local
+Cached: 10.0.1.0/24 dev p2-route SRC 10.0.1.17
+Cached: 10.0.1.17/32 dev p2-route SRC 10.0.1.17 local])
+
+dnl Set disabled interface
+AT_CHECK([ovs-appctl vlog/set 'route_table,dbg'])
+get_log_next_line_num
+AT_CHECK([ovs-vsctl set Open_vSwitch . 
other_config:route-notify-disabled-interfaces="p2-route"])
+dnl expected log line: "route_table|DBG|route notify disabled interfaces: [ 
p2-route]"
+OVS_WAIT_UNTIL([tail -n +$LINENUM ovs-vswitchd.log | grep -P "notify disabled 
interfaces: . p2-route"])
+
+dnl Add a route with interface p1-route.
+AT_CHECK([ip route add 10.0.0.18/32 dev p1-route])
+OVS_WAIT_UNTIL_EQUAL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [dnl
+Cached: 10.0.0.0/24 dev p1-route SRC 10.0.0.17
+Cached: 10.0.0.17

[ovs-dev] [PATCH] conntrack: Do clean instead of force expire.

2024-03-05 Thread Cheng Li
Force expiring a connection and then creating a new connection of the
same tuple (cmap hash) makes the ct->conns cmap operation expensive
(within ct->ct_lock).

This patch covers the scenario by doing the clean immediately instead
of setting the expiration. This patch also fixes the ct_clean debug log.

Signed-off-by: Cheng Li 
---
 lib/conntrack.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 8a7056bac..6e137daa6 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -461,12 +461,6 @@ conn_clean(struct conntrack *ct, struct conn *conn)
 atomic_count_dec(>n_conn);
 }
 
-static void
-conn_force_expire(struct conn *conn)
-{
-atomic_store_relaxed(>expiration, 0);
-}
-
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
  * The caller of this function must already have shut down packet input
  * and PMD threads (which would have been quiesced).  */
@@ -1030,7 +1024,10 @@ conn_update_state(struct conntrack *ct, struct dp_packet 
*pkt,
 case CT_UPDATE_NEW:
 if (conn_lookup(ct, >key_node[CT_DIR_FWD].key,
 now, NULL, NULL)) {
-conn_force_expire(conn);
+/* Instead of setting conn->expiration and let ct_clean thread
+ * clean the conn, doing the clean now to avoid duplicate hash
+ * in ct->conns for performance reason. */
+conn_clean(ct, conn);
 }
 create_new_conn = true;
 break;
@@ -1242,7 +1239,7 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
 if (OVS_UNLIKELY(force && ctx->reply && conn)) {
 if (conn_lookup(ct, >key_node[CT_DIR_FWD].key,
 now, NULL, NULL)) {
-conn_force_expire(conn);
+conn_clean(ct, conn);
 }
 conn = NULL;
 }
@@ -1429,7 +1426,8 @@ conntrack_get_sweep_interval(struct conntrack *ct)
 }
 
 static size_t
-ct_sweep(struct conntrack *ct, struct rculist *list, long long now)
+ct_sweep(struct conntrack *ct, struct rculist *list, long long now,
+ size_t *cleaned_count)
 OVS_NO_THREAD_SAFETY_ANALYSIS
 {
 struct conn *conn;
@@ -1438,6 +1436,7 @@ ct_sweep(struct conntrack *ct, struct rculist *list, long 
long now)
 RCULIST_FOR_EACH (conn, node, list) {
 if (conn_expired(conn, now)) {
 conn_clean(ct, conn);
+(*cleaned_count) ++;
 }
 
 count++;
@@ -1454,7 +1453,7 @@ conntrack_clean(struct conntrack *ct, long long now)
 {
 long long next_wakeup = now + conntrack_get_sweep_interval(ct);
 unsigned int n_conn_limit, i;
-size_t clean_end, count = 0;
+size_t clean_end, count = 0, cleaned_count = 0;
 
 atomic_read_relaxed(>n_conn_limit, _conn_limit);
 clean_end = n_conn_limit / 64;
@@ -1465,13 +1464,13 @@ conntrack_clean(struct conntrack *ct, long long now)
 break;
 }
 
-count += ct_sweep(ct, >exp_lists[i], now);
+count += ct_sweep(ct, >exp_lists[i], now, _count);
 }
 
 ct->next_sweep = (i < N_EXP_LISTS) ? i : 0;
 
-VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
- time_msec() - now);
+VLOG_DBG("conntrack cleanup %"PRIuSIZE"/%"PRIuSIZE" entries in %lld msec",
+ cleaned_count, count, time_msec() - now);
 
 return next_wakeup;
 }
-- 
2.39.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] upcall: Check flow consistant in upcall.

2024-02-19 Thread Cheng Li
The ovs kernel module passes the odp key and packet to userspace.
Next, the packet is extracted into a flow, which is the input for
xlate to generate the wc. At last, the ukey (= odp_key/wc) is
installed into the datapath. If the odp_key is not consistent with
the flow extracted from the packet, the ukey will be wrong.

Commit [1] was created to fix an inconsistency caused by a bad tcp
header. Commit [2] was created to fix an inconsistency caused by a bad
ip header. There is no guarantee of the consistency of the odp_key and
the packet flow. So it is necessary to make this check to prevent
installing a wrong ukey.

[1] 1f5749c790accd98dbcafdaefc40bf5e52d7c672
[2] 79349cbab0b2a755140eedb91833ad2760520a83

Signed-off-by: Cheng Li 
---

Notes:
v2: Leverage avoid_caching to avoid ukey install.

 ofproto/ofproto-dpif-upcall.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index b5cbeed87..bd93f0981 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -66,6 +66,7 @@ COVERAGE_DEFINE(upcall_flow_limit_reduced);
 COVERAGE_DEFINE(upcall_flow_limit_scaled);
 COVERAGE_DEFINE(upcall_ukey_contention);
 COVERAGE_DEFINE(upcall_ukey_replace);
+COVERAGE_DEFINE(upcall_packet_flow_inconsistant);
 
 /* A thread that reads upcalls from dpif, forwards each upcall's packet,
  * and possibly sets up a kernel flow as a cache. */
@@ -840,6 +841,7 @@ recv_upcalls(struct handler *handler)
 struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
 struct upcall upcalls[UPCALL_MAX_BATCH];
 struct flow flows[UPCALL_MAX_BATCH];
+struct flow odp_key_flow;
 size_t n_upcalls, i;
 
 n_upcalls = 0;
@@ -903,6 +905,8 @@ recv_upcalls(struct handler *handler)
 upcall->out_tun_key = dupcall->out_tun_key;
 upcall->actions = dupcall->actions;
 
+/* Save odp flow before overwrite. */
+memcpy(_key_flow, flow, sizeof odp_key_flow);
 pkt_metadata_from_flow(>packet.md, flow);
 flow_extract(>packet, flow);
 
@@ -912,6 +916,12 @@ recv_upcalls(struct handler *handler)
 goto cleanup;
 }
 
+if (!flow_equal_except(_key_flow, flow, >wc)) {
+/* If odp flow is not consistant with flow extract from packet,
+ * bad ukey/mask will be installed. */
+COVERAGE_INC(upcall_packet_flow_inconsistant);
+upcall->xout.avoid_caching = true;
+}
 n_upcalls++;
 continue;
 
@@ -1376,6 +1386,10 @@ should_install_flow(struct udpif *udpif, struct upcall 
*upcall)
 return false;
 }
 
+if (upcall->xout.avoid_caching) {
+return false;
+}
+
 atomic_read_relaxed(>flow_limit, _limit);
 if (udpif_get_n_flows(udpif) >= flow_limit) {
 COVERAGE_INC(upcall_flow_limit_hit);
-- 
2.39.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] upcall: Check flow consistant in upcall.

2024-02-18 Thread Cheng Li
The ovs kernel module passes the odp key and packet to userspace.
Next, the packet is extracted into a flow, which is the input for
xlate to generate the wc. At last, the ukey (= odp_key/wc) is
installed into the datapath. If the odp_key is not consistent with
the flow extracted from the packet, the ukey will be wrong.

Commit [1] was created to fix an inconsistency caused by a bad tcp
header. Commit [2] was created to fix an inconsistency caused by a bad
ip header. There is no guarantee of the consistency of the odp_key and
the packet flow. So it is necessary to make this check to prevent
installing a wrong ukey.

[1] 1f5749c790accd98dbcafdaefc40bf5e52d7c672
[2] 79349cbab0b2a755140eedb91833ad2760520a83

Signed-off-by: Cheng Li 
---
 ofproto/ofproto-dpif-upcall.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index b5cbeed87..6e46e5a5a 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -66,6 +66,7 @@ COVERAGE_DEFINE(upcall_flow_limit_reduced);
 COVERAGE_DEFINE(upcall_flow_limit_scaled);
 COVERAGE_DEFINE(upcall_ukey_contention);
 COVERAGE_DEFINE(upcall_ukey_replace);
+COVERAGE_DEFINE(upcall_packet_flow_inconsistant);
 
 /* A thread that reads upcalls from dpif, forwards each upcall's packet,
  * and possibly sets up a kernel flow as a cache. */
@@ -840,6 +841,8 @@ recv_upcalls(struct handler *handler)
 struct dpif_upcall dupcalls[UPCALL_MAX_BATCH];
 struct upcall upcalls[UPCALL_MAX_BATCH];
 struct flow flows[UPCALL_MAX_BATCH];
+struct flow odp_flow;
+struct flow_wildcards flow_wc;
 size_t n_upcalls, i;
 
 n_upcalls = 0;
@@ -903,8 +906,17 @@ recv_upcalls(struct handler *handler)
 upcall->out_tun_key = dupcall->out_tun_key;
 upcall->actions = dupcall->actions;
 
+/* Save odp flow before overwrite. */
+memcpy(_flow, flow, sizeof flow);
 pkt_metadata_from_flow(>packet.md, flow);
 flow_extract(>packet, flow);
+flow_wildcards_init_for_packet(_wc, );
+if (!flow_equal_except(_flow, flow, _wc)) {
+/* If odp flow is not consistant with flow extract from packet,
+ * bad ukey/mask will be installed. */
+COVERAGE_INC(upcall_packet_flow_inconsistant);
+goto cleanup;
+}
 
 error = process_upcall(udpif, upcall,
>odp_actions, >wc);
-- 
2.39.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] vconn: Count vconn_sent regardless of log level.

2024-01-06 Thread Cheng Li
vconn_sent counter is supposed to increase each time send() return
0, no matter if the vconn log debug is on or off.

Signed-off-by: Cheng Li 
---

Notes:
v2: Increase vconn_sent only if send() return 0.

 lib/vconn.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/vconn.c b/lib/vconn.c
index b55676227..e9603432d 100644
--- a/lib/vconn.c
+++ b/lib/vconn.c
@@ -682,7 +682,6 @@ do_send(struct vconn *vconn, struct ofpbuf *msg)
 
 ofpmsg_update_length(msg);
 if (!VLOG_IS_DBG_ENABLED()) {
-COVERAGE_INC(vconn_sent);
 retval = (vconn->vclass->send)(vconn, msg);
 } else {
 char *s = ofp_to_string(msg->data, msg->size, NULL, NULL, 1);
@@ -693,6 +692,9 @@ do_send(struct vconn *vconn, struct ofpbuf *msg)
 }
 free(s);
 }
+if (!retval) {
+COVERAGE_INC(vconn_sent);
+}
 return retval;
 }
 
-- 
2.39.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] vconn: Count vconn_sent regardless of log level.

2024-01-05 Thread Cheng Li
On Thu, Jan 04, 2024 at 03:16:59PM +0100, Maximets wrote:
> On 1/4/24 14:53, Cheng Li wrote:
> > vconn_sent counter is supposed to increase each time send() is
> > called, no matter if the vconn log debug is on or off.
> 
> Good catch!  But I think the intention was fr it to be increased
> each time we successfully sent something, not when the function
> was called.  This will match the behavior of the vconn_received
> counter.  i.e. we should probably put the counter under if (!retval).
> 
> What do you think?

Make sense. I will send a v2.

> 
> Best regards, Ilya Maximets.
> 
> > 
> > Signed-off-by: Cheng Li 
> > ---
> >  lib/vconn.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/lib/vconn.c b/lib/vconn.c
> > index b556762..2513b03 100644
> > --- a/lib/vconn.c
> > +++ b/lib/vconn.c
> > @@ -681,8 +681,8 @@ do_send(struct vconn *vconn, struct ofpbuf *msg)
> >  ovs_assert(msg->size >= sizeof(struct ofp_header));
> >  
> >  ofpmsg_update_length(msg);
> > +COVERAGE_INC(vconn_sent);
> >  if (!VLOG_IS_DBG_ENABLED()) {
> > -COVERAGE_INC(vconn_sent);
> >  retval = (vconn->vclass->send)(vconn, msg);
> >  } else {
> >  char *s = ofp_to_string(msg->data, msg->size, NULL, NULL, 1);
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] vconn: Count vconn_sent regardless of log level.

2024-01-04 Thread Cheng Li
vconn_sent counter is supposed to increase each time send() is
called, no matter if the vconn log debug is on or off.

Signed-off-by: Cheng Li 
---
 lib/vconn.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/vconn.c b/lib/vconn.c
index b556762..2513b03 100644
--- a/lib/vconn.c
+++ b/lib/vconn.c
@@ -681,8 +681,8 @@ do_send(struct vconn *vconn, struct ofpbuf *msg)
 ovs_assert(msg->size >= sizeof(struct ofp_header));
 
 ofpmsg_update_length(msg);
+COVERAGE_INC(vconn_sent);
 if (!VLOG_IS_DBG_ENABLED()) {
-COVERAGE_INC(vconn_sent);
 retval = (vconn->vclass->send)(vconn, msg);
 } else {
 char *s = ofp_to_string(msg->data, msg->size, NULL, NULL, 1);
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] docs: Add HyperThreading notes for auto-lb usage.

2023-01-30 Thread Cheng Li
On Mon, Jan 30, 2023 at 05:51:57PM +0100, Ilya Maximets wrote:
> On 1/30/23 16:13, Cheng Li wrote:
> > On Mon, Jan 30, 2023 at 01:38:45PM +0100, Ilya Maximets wrote:
> >> On 1/29/23 02:33, Cheng Li wrote:
> >>> On Fri, Jan 27, 2023 at 04:04:55PM +0100, Ilya Maximets wrote:
> >>>> On 1/24/23 16:52, Kevin Traynor wrote:
> >>>>> On 08/01/2023 03:55, Cheng Li wrote:
> >>>>>> In my test, if one logical core is pinned to PMD thread while the
> >>>>>> other logical(of the same physical core) is not. The PMD
> >>>>>> performance is affected the by the not-pinned logical core load.
> >>>>>> This maks it difficult to estimate the loads during a dry-run.
> >>>>>>
> >>>>>> Signed-off-by: Cheng Li 
> >>>>>> ---
> >>>>>>   Documentation/topics/dpdk/pmd.rst | 4 
> >>>>>>   1 file changed, 4 insertions(+)
> >>>>>>
> >>>>>> diff --git a/Documentation/topics/dpdk/pmd.rst 
> >>>>>> b/Documentation/topics/dpdk/pmd.rst
> >>>>>> index 9006fd4..b220199 100644
> >>>>>> --- a/Documentation/topics/dpdk/pmd.rst
> >>>>>> +++ b/Documentation/topics/dpdk/pmd.rst
> >>>>>> @@ -312,6 +312,10 @@ If not set, the default variance improvement 
> >>>>>> threshold is 25%.
> >>>>>>   when all PMD threads are running on cores from a single NUMA 
> >>>>>> node. In this
> >>>>>>   case cross-NUMA datapaths will not change after reassignment.
> >>>>>>   +    For the same reason, please ensure that the pmd threads are 
> >>>>>> pinned to SMT
> >>>>>> +    siblings if HyperThreading is enabled. Otherwise, PMDs within a 
> >>>>>> NUMA may
> >>>>>> +    not have the same performance.
> >>>>
> >>>> Uhm... Am I reading this wrong or this note suggests to pin PMD threads
> >>>> to SMT siblings?  It sounds like that's the opposite of what you were
> >>>> trying to say.  Siblings are sharing the same physical core, so if some
> >>>> PMDs are pinned to siblings, the load prediction can not work correctly.
> >>>
> >>> Thanks for the review, Ilya.
> >>>
> >>> The note indeed suggests to pin PMD threads to sliblings. Sliblings share
> >>> the same physical core, if PMDs pin one slibling while leaving the other
> >>> slibling of the same physical core not pinned, the load prediction may
> >>> not work correctly. Because the pinned slibling performance may affected
> >>> by the not-pinned slibling workload. So we sugguest to pin both
> >>> sliblings of the same physical core.
> >>
> >> But this makes sense only if all the PMD threads are on siblings of the
> >> same physical core.  If more than one physical core is involved, the load
> >> calculations will be incorrect.  For example, let's say we have 4 threads
> >> A, B, C and D, where A and B are siblings and C and D are siblings.  And
> >> it happened that we have only 2 ports, both of which are assigned to A.
> >> It makes a huge difference whether we move one of the ports from A to B
> >> or if we move it from A to C.  It is an oversimplified example, but we
> >> can't rely on load calculations in general case if PMD threads are running
> >> on SMT siblings.
> > 
> > Thanks for the detail explanation, now I get your point.
> > 
> > In your example, PMD B,C,D having no rxq assigned will be in sleep, which
> > costs little CPU cycles. When a logical core(B) is sleeping, its
> > slibling core(A) uses most of the physical core resource, so it's
> > powerfull. If we move one port from A to B, one physical core is
> > running. If we move one port from A to C, two physical cores are
> > running. So the result perfromance will be huge different. Hope I
> > understand correctly.
> 
> Yes.
> 
> And even if they are not sleeping.  E.g. if each thread has 1 port
> to poll, and thread A has 3 ports to poll.  The calculated variance
> will not match the actual performance impact as the actual available
> CPU power is different across cores.

"CPU power is different across cores". How does this happen? Because of
cpufreq may be different across cores? Or because of different poll
count?
As I understand it, no matter how many rxq to poll, no matter the rxqs
are busy or f

Re: [ovs-dev] [PATCH] docs: Add HyperThreading notes for auto-lb usage.

2023-01-30 Thread Cheng Li
On Mon, Jan 30, 2023 at 01:38:45PM +0100, Ilya Maximets wrote:
> On 1/29/23 02:33, Cheng Li wrote:
> > On Fri, Jan 27, 2023 at 04:04:55PM +0100, Ilya Maximets wrote:
> >> On 1/24/23 16:52, Kevin Traynor wrote:
> >>> On 08/01/2023 03:55, Cheng Li wrote:
> >>>> In my test, if one logical core is pinned to PMD thread while the
> >>>> other logical(of the same physical core) is not. The PMD
> >>>> performance is affected the by the not-pinned logical core load.
> >>>> This maks it difficult to estimate the loads during a dry-run.
> >>>>
> >>>> Signed-off-by: Cheng Li 
> >>>> ---
> >>>>   Documentation/topics/dpdk/pmd.rst | 4 
> >>>>   1 file changed, 4 insertions(+)
> >>>>
> >>>> diff --git a/Documentation/topics/dpdk/pmd.rst 
> >>>> b/Documentation/topics/dpdk/pmd.rst
> >>>> index 9006fd4..b220199 100644
> >>>> --- a/Documentation/topics/dpdk/pmd.rst
> >>>> +++ b/Documentation/topics/dpdk/pmd.rst
> >>>> @@ -312,6 +312,10 @@ If not set, the default variance improvement 
> >>>> threshold is 25%.
> >>>>   when all PMD threads are running on cores from a single NUMA node. 
> >>>> In this
> >>>>   case cross-NUMA datapaths will not change after reassignment.
> >>>>   +    For the same reason, please ensure that the pmd threads are 
> >>>> pinned to SMT
> >>>> +    siblings if HyperThreading is enabled. Otherwise, PMDs within a 
> >>>> NUMA may
> >>>> +    not have the same performance.
> >>
> >> Uhm... Am I reading this wrong or this note suggests to pin PMD threads
> >> to SMT siblings?  It sounds like that's the opposite of what you were
> >> trying to say.  Siblings are sharing the same physical core, so if some
> >> PMDs are pinned to siblings, the load prediction can not work correctly.
> > 
> > Thanks for the review, Ilya.
> > 
> > The note indeed suggests to pin PMD threads to sliblings. Sliblings share
> > the same physical core, if PMDs pin one slibling while leaving the other
> > slibling of the same physical core not pinned, the load prediction may
> > not work correctly. Because the pinned slibling performance may affected
> > by the not-pinned slibling workload. So we sugguest to pin both
> > sliblings of the same physical core.
> 
> But this makes sense only if all the PMD threads are on siblings of the
> same physical core.  If more than one physical core is involved, the load
> calculations will be incorrect.  For example, let's say we have 4 threads
> A, B, C and D, where A and B are siblings and C and D are siblings.  And
> it happened that we have only 2 ports, both of which are assigned to A.
> It makes a huge difference whether we move one of the ports from A to B
> or if we move it from A to C.  It is an oversimplified example, but we
> can't rely on load calculations in general case if PMD threads are running
> on SMT siblings.

Thanks for the detailed explanation, now I get your point.

In your example, PMDs B, C, and D, having no rxq assigned, will be
sleeping, which costs few CPU cycles. When a logical core (B) is
sleeping, its sibling core (A) uses most of the physical core's
resources, so it's powerful. If we move one port from A to B, one
physical core is running. If we move one port from A to C, two
physical cores are running. So the resulting performance will be
hugely different. Hope I understand correctly.

To cover this case, one choice is to use only one of the siblings while
leaving the other sibling unused (isolated). I have done some tests:
using both siblings gives a 25% performance improvement over using only
one sibling while leaving the other sibling unused. So this may not be
a good choice.

> 
> > 
> > 
> >>
> >> Nit: s/pmd/PMD/
> >>
> >> Best regards, Ilya Maximets.
> >>
> >>>> +
> >>>>   The minimum time between 2 consecutive PMD auto load balancing 
> >>>> iterations can
> >>>>   also be configured by::
> >>>>   
> >>>
> >>> I don't think it's a hard requirement as siblings should not impact as 
> >>> much as cross-numa might but it's probably good advice in general.
> >>>
> >>> Acked-by: Kevin Traynor 
> >>>
> >>> ___
> >>> dev mailing list
> >>> d...@openvswitch.org
> >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>>
> >>
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH] docs: Add HyperThreading notes for auto-lb usage.

2023-01-28 Thread Cheng Li
On Fri, Jan 27, 2023 at 04:04:55PM +0100, Ilya Maximets wrote:
> On 1/24/23 16:52, Kevin Traynor wrote:
> > On 08/01/2023 03:55, Cheng Li wrote:
> >> In my test, if one logical core is pinned to PMD thread while the
> >> other logical(of the same physical core) is not. The PMD
> >> performance is affected the by the not-pinned logical core load.
> >> This maks it difficult to estimate the loads during a dry-run.
> >>
> >> Signed-off-by: Cheng Li 
> >> ---
> >>   Documentation/topics/dpdk/pmd.rst | 4 
> >>   1 file changed, 4 insertions(+)
> >>
> >> diff --git a/Documentation/topics/dpdk/pmd.rst 
> >> b/Documentation/topics/dpdk/pmd.rst
> >> index 9006fd4..b220199 100644
> >> --- a/Documentation/topics/dpdk/pmd.rst
> >> +++ b/Documentation/topics/dpdk/pmd.rst
> >> @@ -312,6 +312,10 @@ If not set, the default variance improvement 
> >> threshold is 25%.
> >>   when all PMD threads are running on cores from a single NUMA node. 
> >> In this
> >>   case cross-NUMA datapaths will not change after reassignment.
> >>   +    For the same reason, please ensure that the pmd threads are pinned 
> >> to SMT
> >> +    siblings if HyperThreading is enabled. Otherwise, PMDs within a NUMA 
> >> may
> >> +    not have the same performance.
> 
> Uhm... Am I reading this wrong or this note suggests to pin PMD threads
> to SMT siblings?  It sounds like that's the opposite of what you were
> trying to say.  Siblings are sharing the same physical core, so if some
> PMDs are pinned to siblings, the load prediction can not work correctly.

Thanks for the review, Ilya.

The note indeed suggests pinning PMD threads to siblings. Siblings share
the same physical core; if PMDs pin one sibling while leaving the other
sibling of the same physical core not pinned, the load prediction may
not work correctly, because the pinned sibling's performance may be affected
by the not-pinned sibling's workload. So we suggest pinning both
siblings of the same physical core.


> 
> Nit: s/pmd/PMD/
> 
> Best regards, Ilya Maximets.
> 
> >> +
> >>   The minimum time between 2 consecutive PMD auto load balancing 
> >> iterations can
> >>   also be configured by::
> >>   
> > 
> > I don't think it's a hard requirement as siblings should not impact as much 
> > as cross-numa might but it's probably good advice in general.
> > 
> > Acked-by: Kevin Traynor 
> > 
> > ___
> > dev mailing list
> > d...@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> > 
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH] docs: Add HyperThreading notes for auto-lb usage.

2023-01-07 Thread Cheng Li
In my test, if one logical core is pinned to a PMD thread while the
other logical core (of the same physical core) is not, the PMD
performance is affected by the not-pinned logical core's load.
This makes it difficult to estimate the loads during a dry-run.

Signed-off-by: Cheng Li 
---
 Documentation/topics/dpdk/pmd.rst | 4 
 1 file changed, 4 insertions(+)

diff --git a/Documentation/topics/dpdk/pmd.rst 
b/Documentation/topics/dpdk/pmd.rst
index 9006fd4..b220199 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -312,6 +312,10 @@ If not set, the default variance improvement threshold is 
25%.
 when all PMD threads are running on cores from a single NUMA node. In this
 case cross-NUMA datapaths will not change after reassignment.
 
+For the same reason, please ensure that the pmd threads are pinned to SMT
+siblings if HyperThreading is enabled. Otherwise, PMDs within a NUMA may
+not have the same performance.
+
 The minimum time between 2 consecutive PMD auto load balancing iterations can
 also be configured by::
 
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v2] dpif-netdev: calculate per numa variance

2022-12-17 Thread Cheng Li
Currently, pmd_rebalance_dry_run() calculates the overall variance of
all pmds regardless of their numa location. The overall result may
hide an imbalance in an individual numa.

Consider the following case. Numa0 is free because VMs on numa0
are not sending pkts, while numa1 is busy. Within numa1, pmd
workloads are not balanced. Obviously, moving 500 kpps of workload from
pmd 126 to pmd 62 will make numa1 much more balanced. For numa1
the variance improvement will be almost 100%, because after rebalancing
each pmd in numa1 holds the same workload (variance ~= 0). But the overall
variance improvement is only about 20%, which may not trigger auto_lb.

```
numa_id   core_id  kpps
  030 0
  031 0
  094 0
  095 0
  1   126  1500
  1   127  1000
  163  1000
  162   500
```

As auto_lb doesn't balance workload across numa nodes, it makes
more sense to calculate variance improvement per numa node.

Signed-off-by: Cheng Li 
Signed-off-by: Kevin Traynor 
Co-authored-by: Kevin Traynor 
---

Notes:
v2:
- Commit msg update
- Update doc for per numa variance
- Reword variance improment log msg
- Not break numa variance loop for debug purpose

 Documentation/topics/dpdk/pmd.rst |  8 ++--
 lib/dpif-netdev.c | 85 +++
 2 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/Documentation/topics/dpdk/pmd.rst 
b/Documentation/topics/dpdk/pmd.rst
index b259cc8..c335757 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -278,10 +278,10 @@ If a PMD core is detected to be above the load threshold 
and the minimum
 pre-requisites are met, a dry-run using the current PMD assignment algorithm is
 performed.
 
-The current variance of load between the PMD cores and estimated variance from
-the dry-run are both calculated. If the estimated dry-run variance is improved
-from the current one by the variance threshold, a new Rx queue to PMD
-assignment will be performed.
+For each numa node, the current variance of load between the PMD cores and
+estimated variance from the dry-run are both calculated. If any numa's
+estimated dry-run variance is improved from the current one by the variance
+threshold, a new Rx queue to PMD assignment will be performed.
 
 For example, to set the variance improvement threshold to 40%::
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2c08a71..7ff923b 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -6076,39 +6076,33 @@ rxq_scheduling(struct dp_netdev *dp)
 static uint64_t variance(uint64_t a[], int n);
 
 static uint64_t
-sched_numa_list_variance(struct sched_numa_list *numa_list)
+sched_numa_variance(struct sched_numa *numa)
 {
-struct sched_numa *numa;
 uint64_t *percent_busy = NULL;
-unsigned total_pmds = 0;
 int n_proc = 0;
 uint64_t var;
 
-HMAP_FOR_EACH (numa, node, _list->numas) {
-total_pmds += numa->n_pmds;
-percent_busy = xrealloc(percent_busy,
-total_pmds * sizeof *percent_busy);
+percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
 
-for (unsigned i = 0; i < numa->n_pmds; i++) {
-struct sched_pmd *sched_pmd;
-uint64_t total_cycles = 0;
+for (unsigned i = 0; i < numa->n_pmds; i++) {
+struct sched_pmd *sched_pmd;
+uint64_t total_cycles = 0;
 
-sched_pmd = >pmds[i];
-/* Exclude isolated PMDs from variance calculations. */
-if (sched_pmd->isolated == true) {
-continue;
-}
-/* Get the total pmd cycles for an interval. */
-atomic_read_relaxed(_pmd->pmd->intrvl_cycles, _cycles);
+sched_pmd = >pmds[i];
+/* Exclude isolated PMDs from variance calculations. */
+if (sched_pmd->isolated == true) {
+continue;
+}
+/* Get the total pmd cycles for an interval. */
+atomic_read_relaxed(_pmd->pmd->intrvl_cycles, _cycles);
 
-if (total_cycles) {
-/* Estimate the cycles to cover all intervals. */
-total_cycles *= PMD_INTERVAL_MAX;
-percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
- / total_cycles;
-} else {
-percent_busy[n_proc++] = 0;
-}
+if (total_cycles) {
+/* Estimate the cycles to cover all intervals. */
+total_cycles *= PMD_INTERVAL_MAX;
+percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
+/ total_cycles;
+} else {
+percent_busy[n_proc++] = 0;
 }
 }
 var = variance(percent_busy, n_proc);
@@ -6182,6 +6176,7 @@ pmd_rebalance_dry_ru

Re: [ovs-dev] [PATCH] dpif-netdev: calculate per numa variance

2022-12-09 Thread Cheng Li
On Fri, Dec 09, 2022 at 03:32:06PM +, Kevin Traynor wrote:
> On 04/12/2022 11:16, Cheng Li wrote:
> >Currently, pmd_rebalance_dry_run() calculate overall variance of
> >all pmds regardless of their numa location. The overall result may
> >hide un-balance in an individual numa.
> >
> 
> Hi. Thanks, the idea looks a good one. I didn't test yet, couple of
> comments on the code below.

Hi Kevin, thanks for the review.

> 
> >Considering the following case. Numa 0 is free because VMs on numa0
> >are not sending pkts, while numa 1 is busy. Within numa 1, pmds
> >workloads are not balanced. Obviously, moving 500 kpps workloads from
> >pmd 126 to pmd 61 will make numa1 much more balance. For numa1
> >the variance improment will be almost 100%, because after rebalance
> >each pmd in numa1 holds same workload(variance ~= 0). But the overall
> >variance improvement is only about 20%, which may not tigger auto_lb.
> >
> >```
> >numa_id   core_id  kpps
> >   030 0
> >   031 0
> >   094 0
> >   095 0
> >   1   126  1500
> >   1   127  1000
> >   163  1000
> >   162   500
> >```
> >
> >As auto_lb doesn't work if any coss_numa rxq exists, it means that
> 
> That's not fully true. It can run with cross numa in a very limited
> circumstances where there is only PMD active on 1 numa.
> 
> It doesn't diminish the idea here, but just best not write that
> blanket statement as it may confuse.

Makes sense, I will reword the description.

> 
> >auto_lb only balance rxq assignment within each numa. So it makes
> >more sense to calculate variance improvemnet per numa node.
> >
> >Signed-off-by: Cheng Li 
> >---
> >  lib/dpif-netdev.c | 90 
> > +--
> >  1 file changed, 47 insertions(+), 43 deletions(-)
> >
> 
> I think at least some docs would need to be updated to say it's
> variance per NUMA.

After going through the doc directory, I found two pleases:
1. Documentation/topics/dpdk/pmd.rst
2. NEWS

> 
> >diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> >index 2c08a71..6a53f13 100644
> >--- a/lib/dpif-netdev.c
> >+++ b/lib/dpif-netdev.c
> >@@ -6076,39 +6076,33 @@ rxq_scheduling(struct dp_netdev *dp)
> >  static uint64_t variance(uint64_t a[], int n);
> >  static uint64_t
> >-sched_numa_list_variance(struct sched_numa_list *numa_list)
> >+sched_numa_variance(struct sched_numa *numa)
> >  {
> >-struct sched_numa *numa;
> >  uint64_t *percent_busy = NULL;
> >-unsigned total_pmds = 0;
> >  int n_proc = 0;
> >  uint64_t var;
> >-HMAP_FOR_EACH (numa, node, _list->numas) {
> >-total_pmds += numa->n_pmds;
> >-percent_busy = xrealloc(percent_busy,
> >-total_pmds * sizeof *percent_busy);
> >+percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
> >-for (unsigned i = 0; i < numa->n_pmds; i++) {
> >-struct sched_pmd *sched_pmd;
> >-uint64_t total_cycles = 0;
> >+for (unsigned i = 0; i < numa->n_pmds; i++) {
> >+struct sched_pmd *sched_pmd;
> >+uint64_t total_cycles = 0;
> >-sched_pmd = >pmds[i];
> >-/* Exclude isolated PMDs from variance calculations. */
> >-if (sched_pmd->isolated == true) {
> >-continue;
> >-}
> >-/* Get the total pmd cycles for an interval. */
> >-atomic_read_relaxed(_pmd->pmd->intrvl_cycles, 
> >_cycles);
> >+sched_pmd = >pmds[i];
> >+/* Exclude isolated PMDs from variance calculations. */
> >+if (sched_pmd->isolated == true) {
> >+continue;
> >+}
> >+/* Get the total pmd cycles for an interval. */
> >+atomic_read_relaxed(_pmd->pmd->intrvl_cycles, _cycles);
> >-if (total_cycles) {
> >-/* Estimate the cycles to cover all intervals. */
> >-total_cycles *= PMD_INTERVAL_MAX;
> >-percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
> >- / total_cycles;
> >-} else {
> >-percent_busy[n_proc++] = 0;
> >-}
> >+if (total_cycles) {
> >+/* Estimate the cycles to cover all intervals. *

[ovs-dev] [PATCH] dpif-netdev: calculate per numa variance

2022-12-04 Thread Cheng Li
Currently, pmd_rebalance_dry_run() calculates the overall variance of
all pmds regardless of their numa location. The overall result may
hide an imbalance in an individual numa.

Consider the following case. Numa 0 is free because VMs on numa0
are not sending pkts, while numa 1 is busy. Within numa 1, pmd
workloads are not balanced. Obviously, moving 500 kpps of workload from
pmd 126 to pmd 61 will make numa1 much more balanced. For numa1
the variance improvement will be almost 100%, because after rebalancing
each pmd in numa1 holds the same workload (variance ~= 0). But the overall
variance improvement is only about 20%, which may not trigger auto_lb.

```
numa_id   core_id  kpps
  030 0
  031 0
  094 0
  095 0
  1   126  1500
  1   127  1000
  163  1000
  162   500
```

As auto_lb doesn't work if any cross_numa rxq exists, it means that
auto_lb only balances rxq assignment within each numa. So it makes
more sense to calculate variance improvement per numa node.

Signed-off-by: Cheng Li 
---
 lib/dpif-netdev.c | 90 +--
 1 file changed, 47 insertions(+), 43 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2c08a71..6a53f13 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -6076,39 +6076,33 @@ rxq_scheduling(struct dp_netdev *dp)
 static uint64_t variance(uint64_t a[], int n);
 
 static uint64_t
-sched_numa_list_variance(struct sched_numa_list *numa_list)
+sched_numa_variance(struct sched_numa *numa)
 {
-struct sched_numa *numa;
 uint64_t *percent_busy = NULL;
-unsigned total_pmds = 0;
 int n_proc = 0;
 uint64_t var;
 
-HMAP_FOR_EACH (numa, node, _list->numas) {
-total_pmds += numa->n_pmds;
-percent_busy = xrealloc(percent_busy,
-total_pmds * sizeof *percent_busy);
+percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
 
-for (unsigned i = 0; i < numa->n_pmds; i++) {
-struct sched_pmd *sched_pmd;
-uint64_t total_cycles = 0;
+for (unsigned i = 0; i < numa->n_pmds; i++) {
+struct sched_pmd *sched_pmd;
+uint64_t total_cycles = 0;
 
-sched_pmd = >pmds[i];
-/* Exclude isolated PMDs from variance calculations. */
-if (sched_pmd->isolated == true) {
-continue;
-}
-/* Get the total pmd cycles for an interval. */
-atomic_read_relaxed(_pmd->pmd->intrvl_cycles, _cycles);
+sched_pmd = >pmds[i];
+/* Exclude isolated PMDs from variance calculations. */
+if (sched_pmd->isolated == true) {
+continue;
+}
+/* Get the total pmd cycles for an interval. */
+atomic_read_relaxed(_pmd->pmd->intrvl_cycles, _cycles);
 
-if (total_cycles) {
-/* Estimate the cycles to cover all intervals. */
-total_cycles *= PMD_INTERVAL_MAX;
-percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
- / total_cycles;
-} else {
-percent_busy[n_proc++] = 0;
-}
+if (total_cycles) {
+/* Estimate the cycles to cover all intervals. */
+total_cycles *= PMD_INTERVAL_MAX;
+percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
+/ total_cycles;
+} else {
+percent_busy[n_proc++] = 0;
 }
 }
 var = variance(percent_busy, n_proc);
@@ -6182,6 +6176,7 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
 struct sched_numa_list numa_list_est;
 bool thresh_met = false;
 uint64_t current_var, estimate_var;
+struct sched_numa *numa_cur, *numa_est;
 uint64_t improvement = 0;
 
 VLOG_DBG("PMD auto load balance performing dry run.");
@@ -6200,24 +6195,33 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
 sched_numa_list_count(_list_est) == 1) {
 
 /* Calculate variances. */
-current_var = sched_numa_list_variance(_list_cur);
-estimate_var = sched_numa_list_variance(_list_est);
-
-if (estimate_var < current_var) {
- improvement = ((current_var - estimate_var) * 100) / current_var;
-}
-VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64".",
- current_var, estimate_var);
-VLOG_DBG("Variance improvement %"PRIu64"%%.", improvement);
-
-if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
-thresh_met = true;
-VLOG_DBG("PMD load variance improvement threshold %u%% "
- "is met.", dp->pmd_alb.rebal

[ovs-dev] [PATCH] m4: test avx512 for x86 only

2022-09-16 Thread Cheng Li
The arm version of the 'as' command may not support the '--64' option.
This patch moves the avx512 test into the x86 branch to avoid this.

Signed-off-by: Cheng Li 
---
 m4/openvswitch.m4 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4
index b5be5842..14d9249 100644
--- a/m4/openvswitch.m4
+++ b/m4/openvswitch.m4
@@ -494,8 +494,8 @@ AC_DEFUN([OVS_CHECK_BINUTILS_AVX512],
  mkdir -p build-aux
  OBJFILE=build-aux/binutils_avx512_check.o
  GATHER_PARAMS='0x8(,%ymm1,1),%ymm0{%k2}'
- echo "vpgatherqq $GATHER_PARAMS" | as --64 -o $OBJFILE -
  if ($CC -dumpmachine | grep x86_64) >/dev/null 2>&1; then
+   echo "vpgatherqq $GATHER_PARAMS" | as --64 -o $OBJFILE -
if (objdump -d  --no-show-raw-insn $OBJFILE | grep -q $GATHER_PARAMS) 
>/dev/null 2>&1; then
  ovs_cv_binutils_avx512_good=yes
else
@@ -504,11 +504,11 @@ AC_DEFUN([OVS_CHECK_BINUTILS_AVX512],
  dnl and causing zmm usage with buggy binutils versions.
  CFLAGS="$CFLAGS -mno-avx512f"
fi
+   rm $OBJFILE
  else
dnl non x86_64 architectures don't have avx512, so not affected
ovs_cv_binutils_avx512_good=no
  fi])
- rm $OBJFILE
if test "$ovs_cv_binutils_avx512_good" = yes; then
  AC_DEFINE([HAVE_LD_AVX512_GOOD], [1],
[Define to 1 if binutils correctly supports AVX512.])
-- 
1.8.3.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] ovs dpdk multiple queues donot work

2022-09-06 Thread Cheng Li
On Tue, Sep 06, 2022 at 02:36:39AM +, 孟熠华 wrote:
> Hi guys,
>I meet a question about ovs dpdk multiple queues.
>When testing tcp band width with iperf3 or qperf ,all packets are only 
> in one rxq.And the bw result is too low.
>Would you please help me?

I guess you have only one pmd, or only one pmd is forwarding.
In version 2.11, one pmd only outputs pkts to a single txq (for a VM
it's an rxq).

> 
> Version
>ovs-vsctl (Open vSwitch) 2.11.0
>DPDK 18.11.9
> 
> Configration
>[root@controller-node openvswitch-2.11.0]# ovs-vsctl list open_vswitch
> _uuid   : 154520f8-47b1-4dbb-ade1-650a9dac8340
> bridges : [3878e475-e959-4a6d-82f8-b931d8cd43d5, 
> 8f1e93a1-f844-4027-97fa-276e32a9b555, 9d08bcc7-cfb7-434b-8247-7b6c02487e07]
> cur_cfg : 151
> datapath_types  : [netdev, system]
> db_version  : []
> dpdk_initialized: true
> dpdk_version: "DPDK 18.11.9"
> external_ids: {hostname=controller-node}
> iface_types : [dpdk, dpdkr, dpdkvhostuser, dpdkvhostuserclient, 
> erspan, geneve, gre, internal, "ip6erspan", "ip6gre", lisp, patch, stt, 
> system, tap, vxlan]
> manager_options : []
> next_cfg: 151
> other_config: {dpdk-init="true", dpdk-socket-mem="4096,0", 
> pmd-auto-lb="true", pmd-auto-lb-rebal-interval="10", 
> pmd-cpu-mask="0xf00", pmd-perf-metrics="true", pmd-rxq-assign=cycles, 
> vhost-iommu-support="true", vhost-sock-dir="."}
> ovs_version : []
> ssl : []
> statistics  : {}
> system_type : []
> system_version  : []
> 
> [root@controller-node openvswitch-2.11.0]# ovs-vsctl list Interface dpdk0
> _uuid   : 70193180-a43b-4c5a-9013-f9004f20950a
> admin_state : up
> bfd : {}
> bfd_status  : {}
> cfm_fault   : []
> cfm_fault_status: []
> cfm_flap_count  : []
> cfm_health  : []
> cfm_mpid: []
> cfm_remote_mpids: []
> cfm_remote_opstate  : []
> duplex  : full
> error   : []
> external_ids: {}
> ifindex : 10816857
> ingress_policing_burst: 0
> ingress_policing_rate: 0
> lacp_current: []
> link_resets : 2
> link_speed  : 100
> link_state  : up
> lldp: {}
> mac : []
> mac_in_use  : "b4:96:91:49:56:28"
> mtu : 1500
> mtu_request : 1500
> name: "dpdk0"
> ofport  : 1
> ofport_request  : []
> options : {dpdk-devargs=":02:00.0", n_rxq="4", 
> n_rxq_desc="2048", n_txq="1"}
> other_config: {}
> statistics  : {flow_director_filter_add_errors=164803, 
> flow_director_filter_remove_errors=456422, mac_local_errors=0, 
> mac_remote_errors=0, "rx_128_to_255_packets"=29913, 
> "rx_1_to_64_packets"=23786, "rx_256_to_511_packets"=23611, 
> "rx_512_to_1023_packets"=10639, "rx_65_to_127_packets"=1288610, 
> rx_broadcast_packets=10181, rx_bytes=271122791632, rx_crc_errors=26168302729, 
> rx_dropped=0, rx_errors=0, rx_fcoe_crc_errors=68, rx_fcoe_dropped=0, 
> rx_fcoe_mbuf_allocation_errors=0, rx_fragment_errors=0, 
> rx_illegal_byte_errors=0, rx_jabber_errors=0, rx_length_errors=0, 
> rx_mac_short_packet_dropped=45097959, rx_management_dropped=10639, 
> rx_management_packets=23611, rx_mbuf_allocation_errors=0, rx_missed_errors=0, 
> rx_oversize_errors=1288610, rx_packets=180342635, "rx_priority0_dropped"=0, 
> "rx_priority0_mbuf_allocation_errors"=0, "rx_priority1_dropped"=0, 
> "rx_priority1_mbuf_allocation_errors"=16472, "rx_priority2_dropped"=0, 
> "rx_priority2_mbuf_allocation_errors"=0, "rx_priority3_dropped"=0, 
> "rx_priority3_mbuf_allocation_errors"=0, "rx_priority4_dropped"=0, 
> "rx_priority4_mbuf_allocation_errors"=0, "rx_priority5_dropped"=0, 
> "rx_priority5_mbuf_allocation_errors"=0, "rx_priority6_dropped"=0, 
> "rx_priority6_mbuf_allocation_errors"=0, "rx_priority7_dropped"=0, 
> "rx_priority7_mbuf_allocation_errors"=0, rx_undersize_errors=23786, 
> "tx_128_to_255_packets"=1174316, "tx_1_to_64_packets"=119, 
> "tx_256_to_511_packets"=164803, "tx_512_to_1023_packets"=456422, 
> "tx_65_to_127_packets"=28554784, tx_broadcast_packets=68, 
> tx_bytes=26168304871, tx_dropped=0, tx_errors=0, 
> tx_management_packets=178966076, tx_multicast_packets=7, tx_packets=45097959}
> status  : {driver_name=net_ixgbe, if_descr="DPDK 18.11.9 
> net_ixgbe", if_type="6", link_speed="10Gbps", max_hash_mac_addrs="4096", 
> max_mac_addrs="127", max_rx_pktlen="1518", max_rx_queues="128", 
> max_tx_queues="64", max_vfs="0", max_vmdq_pools="64", min_rx_bufsize="1024", 
> numa_id="0", pci-device_id="0x1528", pci-vendor_id="0x8086", port_no="0"}
> type: dpdk
> 
> [root@controller-node openvswitch-2.11.0]# ovs-vsctl list Interface 
> vhost-user1
> _uuid   : 0cc484b0-39a8-4df4-83fe-50eb629f96bd
> admin_state : up
> bfd