2/2: Fix packet drops on LACP bond after link up
Problem:
On certain Fortville NICs it has been observed that PHY UP detection can
get delayed (sometimes up to 4-5 secs). When the driver fails to fetch
PHY status as UP even though its actually UP, LACP packets can get
exchanged and LACP slave brought UP. In such a case, the remote end
would start sending traffic on that slave, but OVS drops it, as the
bond-slave is not yet enabled due to PHY DOWN.
Fix:
The main intention here is delay LACP negotiation until carrier (PHY)
status is read as UP.
1. In port_run()/bundle_run(), cache the carrier status in
"struct ofport_dpif"/"struct bond_slave".
2. When carrier state is DOWN, do not send any LACPDUs and
drop any received LACPDUs.
3. When LACP state changes or LACPDU is received, trigger re-checking
of carrier-state (in port_run()) by incrementing the connectivity
sequence number to find out the true carrier state as fast as
possible.
Signed-off-by: Manohar K C
CC: Jan Scheurich
CC: Nitin Katiyar
---
v1 1/2: Evaluate lacp may_enable inline in the datapath thread and
check for pending "enable" in the bond admissibility check to avoid
drops.
lib/lacp.c | 51 +---
lib/lacp.h | 6 +++---
ofproto/bond.c | 30 ++
ofproto/bond.h | 3 +++
ofproto/ofproto-dpif-xlate.c | 4 +++-
ofproto/ofproto-dpif.c | 11 +++---
tests/ofproto-dpif.at| 3 +++
7 files changed, 93 insertions(+), 15 deletions(-)
diff --git a/lib/lacp.c b/lib/lacp.c
index 8353746..88621bc 100644
--- a/lib/lacp.c
+++ b/lib/lacp.c
@@ -33,6 +33,7 @@
#include "unixctl.h"
#include "openvswitch/vlog.h"
#include "util.h"
+#include "ofproto/bond.h"
VLOG_DEFINE_THIS_MODULE(lacp);
@@ -148,7 +149,7 @@ static void slave_get_actor(struct slave *, struct
lacp_info *actor)
OVS_REQUIRES(mutex);
static void slave_get_priority(struct slave *, struct lacp_info *priority)
OVS_REQUIRES(mutex);
-static bool slave_may_tx(const struct slave *)
+static bool slave_may_tx(const void *bond, const struct slave *)
OVS_REQUIRES(mutex);
static struct slave *slave_lookup(const struct lacp *, const void *slave)
OVS_REQUIRES(mutex);
@@ -325,7 +326,7 @@ lacp_is_active(const struct lacp *lacp) OVS_EXCLUDED(mutex)
* called on all packets received on 'slave_' with Ethernet Type ETH_TYPE_LACP.
*/
void
-lacp_process_packet(struct lacp *lacp, const void *slave_,
+lacp_process_packet(struct lacp *lacp, const void *bond, const void *slave_,
const struct dp_packet *packet)
OVS_EXCLUDED(mutex)
{
@@ -333,6 +334,7 @@ lacp_process_packet(struct lacp *lacp, const void *slave_,
const struct lacp_pdu *pdu;
long long int tx_rate;
struct slave *slave;
+bool carrier_up = false;
lacp_lock();
slave = slave_lookup(lacp, slave_);
@@ -348,6 +350,21 @@ lacp_process_packet(struct lacp *lacp, const void *slave_,
goto out;
}
+/*
+ * On some NICs L1 state reporting is slow. In case LACP packets are
+ * received while carrier (L1) state is still down, drop the LACPDU and
+ * trigger re-checking of L1 state.
+ */
+carrier_up = bond_slave_get_carrier(bond, slave_);
+if (!carrier_up) {
+seq_change(connectivity_seq_get());
+
+VLOG_INFO("carrier still DOWN - conn seq changed for slave %s, "
+ "dropping packet\n", slave->name);
+
+goto out;
+}
+
slave->status = LACP_CURRENT;
tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX;
timer_set_duration(&slave->rx, LACP_RX_MULTIPLIER * tx_rate);
@@ -521,7 +538,8 @@ lacp_slave_is_current(const struct lacp *lacp, const void
*slave_)
/* This function should be called periodically to update 'lacp'. */
void
-lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
+lacp_run(struct lacp *lacp, const void *bond,
+ lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
{
struct slave *slave;
@@ -551,7 +569,7 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu)
OVS_EXCLUDED(mutex)
HMAP_FOR_EACH (slave, node, &lacp->slaves) {
struct lacp_info actor;
-if (!slave_may_tx(slave)) {
+if (!slave_may_tx(bond, slave)) {
continue;
}
@@ -580,13 +598,13 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu)
OVS_EXCLUDED(mutex)
/* Causes poll_block() to wake up when lacp_run() needs to be called again. */
void
-lacp_wait(struct lacp *lacp) OVS_EXCLUDED(mutex)
+lacp_wait(struct lacp *lacp, const void *bond) OVS_EXCLUDED(mutex)
{
struct slave *slave;
lacp_lock();
HMAP_FOR_EACH (slave, node, &lacp->slaves) {
-if (slave_may_tx(slave)) {
+if (slave_may_tx(bond, slave)) {
timer_wait(&slave->tx);
}
@@ -810,9 +828,26 @@ slave_get_priority(struct slave *slave, struct lacp_info
*prio