The ovn-controller had a race condition over MAC
binding table with other controllers. When multiple
controllers received GARP from single source usually
the one who was able to win the race put it into SB.
The others got transaction error which triggered
full recompute even if it's not needed.

In order to reduce the chance of multiple controllers
trying to insert same row at the same time add slight
delay to the MAC binding processing. This delay
is random interval between 1-50 in ms. This greatly
reduces the chance that multiple controllers will try
to add MAC binding at exactly the same time. This applies
only to multicast ARP request which applies only to GARP
that is sent to broadcast address.

Local testing with this delay vs without show significantly
reduced chance of hitting the transaction error.

During the testing 10 GARPs was sent to two controllers
at the same time. Without proposed fix at least one controller
had multiple transaction errors present every test iteration.

With the proposed fix the transaction error was reduced to a
single one when it happened which was usually in less than
10% of the iterations.

As was mentioned before the race condition can still happen,
but the chance is greatly reduced.

Suggested-by: Daniel Alvarez Sanchez <dalva...@redhat.com>
Signed-off-by: Ales Musil <amu...@redhat.com>
---
v3: Rebase on top of current main.
    Change the delay to be per ARP request instead of
    single delay for everything. This allows the possibility
    to delay only multicast/broadcast AP as suggested by Han.
---
 controller/mac-learn.c | 23 ++++++++++++++++++++++-
 controller/mac-learn.h |  7 ++++++-
 controller/pinctrl.c   | 33 ++++++++++++++++++++++++---------
 tests/ovn.at           |  7 +++++++
 4 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/controller/mac-learn.c b/controller/mac-learn.c
index 27634dca8..b7b392bb1 100644
--- a/controller/mac-learn.c
+++ b/controller/mac-learn.c
@@ -20,12 +20,15 @@
 /* OpenvSwitch lib includes. */
 #include "openvswitch/vlog.h"
 #include "lib/packets.h"
+#include "lib/random.h"
 #include "lib/smap.h"
+#include "lib/timeval.h"
 
 VLOG_DEFINE_THIS_MODULE(mac_learn);
 
 #define MAX_MAC_BINDINGS 1000
 #define MAX_FDB_ENTRIES  1000
+#define MAX_MAC_BINDING_DELAY_MSEC 50
 
 static size_t mac_binding_hash(uint32_t dp_key, uint32_t port_key,
                                struct in6_addr *);
@@ -64,7 +67,7 @@ ovn_mac_bindings_destroy(struct hmap *mac_bindings)
 struct mac_binding *
 ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t dp_key,
                     uint32_t port_key, struct in6_addr *ip,
-                    struct eth_addr mac)
+                    struct eth_addr mac, bool is_delayed)
 {
     uint32_t hash = mac_binding_hash(dp_key, port_key, ip);
 
@@ -79,6 +82,11 @@ ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t 
dp_key,
         mb->dp_key = dp_key;
         mb->port_key = port_key;
         mb->ip = *ip;
+        mb->delay = 0;
+        mb->created = time_msec();
+        if (is_delayed) {
+            mb->delay = random_range(MAX_MAC_BINDING_DELAY_MSEC) + 1;
+        }
         hmap_insert(mac_bindings, &mb->hmap_node, hash);
     }
     mb->mac = mac;
@@ -86,6 +94,19 @@ ovn_mac_binding_add(struct hmap *mac_bindings, uint32_t 
dp_key,
     return mb;
 }
 
+long long
+ovn_mac_binding_shortest_delay(struct hmap *mac_bindings)
+{
+    long long min = LLONG_MAX;
+
+    struct mac_binding *mb;
+    HMAP_FOR_EACH (mb, hmap_node, mac_bindings) {
+        min = MIN(mb->delay, min);
+    }
+
+    return min;
+}
+
 /* fdb functions. */
 void
 ovn_fdb_init(struct hmap *fdbs)
diff --git a/controller/mac-learn.h b/controller/mac-learn.h
index e7e8ba2d3..d306bfa5a 100644
--- a/controller/mac-learn.h
+++ b/controller/mac-learn.h
@@ -31,16 +31,21 @@ struct mac_binding {
 
     /* Value. */
     struct eth_addr mac;
+
+    /* Delay for multicast ARP. */
+    long long created;
+    uint32_t delay;
 };
 
 void ovn_mac_bindings_init(struct hmap *mac_bindings);
 void ovn_mac_bindings_flush(struct hmap *mac_bindings);
 void ovn_mac_bindings_destroy(struct hmap *mac_bindings);
+long long ovn_mac_binding_shortest_delay(struct hmap *mac_bindings);
 
 struct mac_binding *ovn_mac_binding_add(struct hmap *mac_bindings,
                                         uint32_t dp_key, uint32_t port_key,
                                         struct in6_addr *ip,
-                                        struct eth_addr mac);
+                                        struct eth_addr mac, bool is_delayed);
 
 
 
diff --git a/controller/pinctrl.c b/controller/pinctrl.c
index f954362b7..9c615ebd5 100644
--- a/controller/pinctrl.c
+++ b/controller/pinctrl.c
@@ -4134,9 +4134,13 @@ pinctrl_handle_put_mac_binding(const struct flow *md,
         memcpy(&ip_key, &ip6, sizeof ip_key);
     }
 
+    /* If the ARP reply was unicast we should not delay it,
+     * there won't be any race. */
+    bool is_multicast = eth_addr_is_multicast(headers->dl_dst);
     struct mac_binding *mb = ovn_mac_binding_add(&put_mac_bindings, dp_key,
                                                  port_key, &ip_key,
-                                                 headers->dl_src);
+                                                 headers->dl_src,
+                                                 is_multicast);
     if (!mb) {
         COVERAGE_INC(pinctrl_drop_put_mac_binding);
         return;
@@ -4296,14 +4300,23 @@ run_put_mac_bindings(struct ovsdb_idl_txn 
*ovnsb_idl_txn,
         return;
     }
 
-    const struct mac_binding *mb;
-    HMAP_FOR_EACH (mb, hmap_node, &put_mac_bindings) {
-        run_put_mac_binding(ovnsb_idl_txn, sbrec_datapath_binding_by_key,
-                            sbrec_port_binding_by_key,
-                            sbrec_mac_binding_by_lport_ip,
-                            mb);
+    long long now = time_msec();
+
+    struct mac_binding *mb;
+    HMAP_FOR_EACH_SAFE (mb, hmap_node, &put_mac_bindings) {
+            long long elapsed = now - mb->created;
+            if (elapsed >= mb->delay) {
+                run_put_mac_binding(ovnsb_idl_txn,
+                                    sbrec_datapath_binding_by_key,
+                                    sbrec_port_binding_by_key,
+                                    sbrec_mac_binding_by_lport_ip,
+                                    mb);
+                hmap_remove(&put_mac_bindings, &mb->hmap_node);
+                free(mb);
+            } else {
+                mb->delay -= elapsed;
+            }
     }
-    ovn_mac_bindings_flush(&put_mac_bindings);
 }
 
 static void
@@ -4352,9 +4365,11 @@ run_buffered_binding(struct ovsdb_idl_index 
*sbrec_mac_binding_by_lport_ip,
 
 static void
 wait_put_mac_bindings(struct ovsdb_idl_txn *ovnsb_idl_txn)
+    OVS_REQUIRES(pinctrl_mutex)
 {
     if (ovnsb_idl_txn && !hmap_is_empty(&put_mac_bindings)) {
-        poll_immediate_wake();
+        long long delay = ovn_mac_binding_shortest_delay(&put_mac_bindings);
+        poll_timer_wait(delay);
     }
 }
 
diff --git a/tests/ovn.at b/tests/ovn.at
index c346975e6..af82024ac 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -19601,6 +19601,7 @@ data=0800bee4391a0001
 
 send_icmp_packet 1 1 $src_mac $router_mac0 $src_ip $dst_ip 0d1c $data
 send_arp_reply 2 1 $dst_mac $router_mac1 $dst_ip $router_ip
+OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.10"])
 echo $(get_arp_req $router_mac1 $router_ip $dst_ip) > expected
 echo 
"${dst_mac}${router_mac1}08004500001c00004000fe0122b5${router_ip}${dst_ip}${data}"
 >> expected
 
@@ -19641,6 +19642,7 @@ gw_router_mac=f00000010a0a
 send_icmp_packet 2 2 f00000110203 $router_mac0 $src_ip $dst_ip 0c1b $data
 echo $(get_arp_req f00000010204 $fip_ip $gw_router_ip) >> expected
 send_arp_reply 2 1 $gw_router_mac f00000010204 $gw_router_ip $fip_ip
+OVS_WAIT_UNTIL([ovn-sbctl list mac_binding | grep -q "172.16.1.11"])
 echo 
"${gw_router_mac}f0000001020408004500001c00004000fe0121b4${fip_ip}${dst_ip}${data}"
 >> expected
 
 AT_CHECK([as hv2 ovs-ofctl dump-flows br-int table=37 | grep pkt_mark=0x64 | 
grep -q n_packets=1],[0])
@@ -23049,6 +23051,11 @@ grep table_id=10 | wc -l`])
 
 check_row_count MAC_Binding 1
 
+OVS_WAIT_UNTIL(
+    [test 1 = `as hv1 ovs-ofctl dump-flows br-int table=67 | grep 
dl_src=50:54:00:00:00:13 \
+| wc -l`]
+)
+
 AT_CHECK([ovn-sbctl --format=csv --bare --columns logical_port,ip,mac \
 list mac_binding], [0], [lr0-sw0
 10.0.0.30
-- 
2.35.3

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to