Re: [ovs-dev] [PATCH v1 2/2] Fix packet drops on LACP bond after link up

2018-06-05 Thread Manohar Krishnappa Chidambaraswamy
Hi,

Could someone take a look at this patch? We hit this issue (drops) on a 
customer setup and appreciate your comments/suggestions.

Thanx
Manu

On 17/05/18, 3:29 PM, "ovs-dev-boun...@openvswitch.org on behalf of Manohar 
Krishnappa Chidambaraswamy"  wrote:

2/2: Fix packet drops on LACP bond after link up

Problem:

On certain Fortville NICs it has been observed that PHY UP detection can
get delayed (sometimes up to 4-5 secs). When the driver fails to fetch
PHY status as UP even though its actually UP,  LACP packets can get
exchanged and LACP slave brought UP. In such a case, the remote end
would start sending traffic on that slave, but OVS drops it, as the
bond-slave is not yet enabled due to PHY DOWN.

Fix:

The main intention here is delay LACP negotiation until carrier (PHY)
status is read as UP.

1. In port_run()/bundle_run(), cache the carrier status in
"struct ofport_dpif"/"struct bond_slave".

2. When carrier state is DOWN, do not send any LACPDUs and
 drop any received LACPDUs.

3. When LACP state changes or LACPDU is received, trigger re-checking
  of carrier-state (in port_run()) by incrementing the connectivity
  sequence number to find out the true carrier state as fast as
  possible.

Signed-off-by: Manohar K C

CC: Jan Scheurich 
CC: Nitin Katiyar 
---

v1 1/2: Evaluate lacp may_enable inline in the datapath thread and
check for pending "enable" in the bond admissibility check to avoid
drops.

 lib/lacp.c   | 51 
+---
 lib/lacp.h   |  6 +++---
 ofproto/bond.c   | 30 ++
 ofproto/bond.h   |  3 +++
 ofproto/ofproto-dpif-xlate.c |  4 +++-
 ofproto/ofproto-dpif.c   | 11 +++---
 tests/ofproto-dpif.at|  3 +++
 7 files changed, 93 insertions(+), 15 deletions(-)

diff --git a/lib/lacp.c b/lib/lacp.c
index 8353746..88621bc 100644
--- a/lib/lacp.c
+++ b/lib/lacp.c
@@ -33,6 +33,7 @@
 #include "unixctl.h"
 #include "openvswitch/vlog.h"
 #include "util.h"
+#include "ofproto/bond.h"
 
 VLOG_DEFINE_THIS_MODULE(lacp);
 
@@ -148,7 +149,7 @@ static void slave_get_actor(struct slave *, struct 
lacp_info *actor)
 OVS_REQUIRES(mutex);
 static void slave_get_priority(struct slave *, struct lacp_info *priority)
 OVS_REQUIRES(mutex);
-static bool slave_may_tx(const struct slave *)
+static bool slave_may_tx(const void *bond, const struct slave *)
 OVS_REQUIRES(mutex);
 static struct slave *slave_lookup(const struct lacp *, const void *slave)
 OVS_REQUIRES(mutex);
@@ -325,7 +326,7 @@ lacp_is_active(const struct lacp *lacp) 
OVS_EXCLUDED(mutex)
  * called on all packets received on 'slave_' with Ethernet Type 
ETH_TYPE_LACP.
  */
 void
-lacp_process_packet(struct lacp *lacp, const void *slave_,
+lacp_process_packet(struct lacp *lacp, const void *bond, const void 
*slave_,
 const struct dp_packet *packet)
 OVS_EXCLUDED(mutex)
 {
@@ -333,6 +334,7 @@ lacp_process_packet(struct lacp *lacp, const void 
*slave_,
 const struct lacp_pdu *pdu;
 long long int tx_rate;
 struct slave *slave;
+bool carrier_up = false;
 
 lacp_lock();
 slave = slave_lookup(lacp, slave_);
@@ -348,6 +350,21 @@ lacp_process_packet(struct lacp *lacp, const void 
*slave_,
 goto out;
 }
 
+/*
+ * On some NICs L1 state reporting is slow. In case LACP packets are
+ * received while carrier (L1) state is still down, drop the LACPDU and
+ * trigger re-checking of L1 state.
+ */
+carrier_up = bond_slave_get_carrier(bond, slave_);
+if (!carrier_up) {
+seq_change(connectivity_seq_get());
+
+VLOG_INFO("carrier still DOWN - conn seq changed for slave %s, "
+  "dropping packet\n", slave->name);
+
+goto out;
+}
+
 slave->status = LACP_CURRENT;
 tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX;
 timer_set_duration(&slave->rx, LACP_RX_MULTIPLIER * tx_rate);
@@ -521,7 +538,8 @@ lacp_slave_is_current(const struct lacp *lacp, const 
void *slave_)
 
 /* This function should be called periodically to update 'lacp'. */
 void
-lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
+lacp_run(struct lacp *lacp, const void *bond,
+ lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
 {
 struct slave *slave;
 
@@ -551,7 +569,7 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) 
OVS_EXCLUDED(mutex)
 HMAP_FOR_EACH (slave, node, &lacp->slaves) {
 struct lac

[ovs-dev] [PATCH v1 2/2] Fix packet drops on LACP bond after link up

2018-05-17 Thread Manohar Krishnappa Chidambaraswamy
2/2: Fix packet drops on LACP bond after link up

Problem:

On certain Fortville NICs it has been observed that PHY UP detection can
get delayed (sometimes up to 4-5 secs). When the driver fails to fetch
PHY status as UP even though its actually UP,  LACP packets can get
exchanged and LACP slave brought UP. In such a case, the remote end
would start sending traffic on that slave, but OVS drops it, as the
bond-slave is not yet enabled due to PHY DOWN.

Fix:

The main intention here is delay LACP negotiation until carrier (PHY)
status is read as UP.

1. In port_run()/bundle_run(), cache the carrier status in
"struct ofport_dpif"/"struct bond_slave".

2. When carrier state is DOWN, do not send any LACPDUs and
 drop any received LACPDUs.

3. When LACP state changes or LACPDU is received, trigger re-checking
  of carrier-state (in port_run()) by incrementing the connectivity
  sequence number to find out the true carrier state as fast as
  possible.

Signed-off-by: Manohar K C

CC: Jan Scheurich 
CC: Nitin Katiyar 
---

v1 1/2: Evaluate lacp may_enable inline in the datapath thread and
check for pending "enable" in the bond admissibility check to avoid
drops.

 lib/lacp.c   | 51 +---
 lib/lacp.h   |  6 +++---
 ofproto/bond.c   | 30 ++
 ofproto/bond.h   |  3 +++
 ofproto/ofproto-dpif-xlate.c |  4 +++-
 ofproto/ofproto-dpif.c   | 11 +++---
 tests/ofproto-dpif.at|  3 +++
 7 files changed, 93 insertions(+), 15 deletions(-)

diff --git a/lib/lacp.c b/lib/lacp.c
index 8353746..88621bc 100644
--- a/lib/lacp.c
+++ b/lib/lacp.c
@@ -33,6 +33,7 @@
 #include "unixctl.h"
 #include "openvswitch/vlog.h"
 #include "util.h"
+#include "ofproto/bond.h"
 
 VLOG_DEFINE_THIS_MODULE(lacp);
 
@@ -148,7 +149,7 @@ static void slave_get_actor(struct slave *, struct 
lacp_info *actor)
 OVS_REQUIRES(mutex);
 static void slave_get_priority(struct slave *, struct lacp_info *priority)
 OVS_REQUIRES(mutex);
-static bool slave_may_tx(const struct slave *)
+static bool slave_may_tx(const void *bond, const struct slave *)
 OVS_REQUIRES(mutex);
 static struct slave *slave_lookup(const struct lacp *, const void *slave)
 OVS_REQUIRES(mutex);
@@ -325,7 +326,7 @@ lacp_is_active(const struct lacp *lacp) OVS_EXCLUDED(mutex)
  * called on all packets received on 'slave_' with Ethernet Type ETH_TYPE_LACP.
  */
 void
-lacp_process_packet(struct lacp *lacp, const void *slave_,
+lacp_process_packet(struct lacp *lacp, const void *bond, const void *slave_,
 const struct dp_packet *packet)
 OVS_EXCLUDED(mutex)
 {
@@ -333,6 +334,7 @@ lacp_process_packet(struct lacp *lacp, const void *slave_,
 const struct lacp_pdu *pdu;
 long long int tx_rate;
 struct slave *slave;
+bool carrier_up = false;
 
 lacp_lock();
 slave = slave_lookup(lacp, slave_);
@@ -348,6 +350,21 @@ lacp_process_packet(struct lacp *lacp, const void *slave_,
 goto out;
 }
 
+/*
+ * On some NICs L1 state reporting is slow. In case LACP packets are
+ * received while carrier (L1) state is still down, drop the LACPDU and
+ * trigger re-checking of L1 state.
+ */
+carrier_up = bond_slave_get_carrier(bond, slave_);
+if (!carrier_up) {
+seq_change(connectivity_seq_get());
+
+VLOG_INFO("carrier still DOWN - conn seq changed for slave %s, "
+  "dropping packet\n", slave->name);
+
+goto out;
+}
+
 slave->status = LACP_CURRENT;
 tx_rate = lacp->fast ? LACP_FAST_TIME_TX : LACP_SLOW_TIME_TX;
 timer_set_duration(&slave->rx, LACP_RX_MULTIPLIER * tx_rate);
@@ -521,7 +538,8 @@ lacp_slave_is_current(const struct lacp *lacp, const void 
*slave_)
 
 /* This function should be called periodically to update 'lacp'. */
 void
-lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
+lacp_run(struct lacp *lacp, const void *bond,
+ lacp_send_pdu *send_pdu) OVS_EXCLUDED(mutex)
 {
 struct slave *slave;
 
@@ -551,7 +569,7 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) 
OVS_EXCLUDED(mutex)
 HMAP_FOR_EACH (slave, node, &lacp->slaves) {
 struct lacp_info actor;
 
-if (!slave_may_tx(slave)) {
+if (!slave_may_tx(bond, slave)) {
 continue;
 }
 
@@ -580,13 +598,13 @@ lacp_run(struct lacp *lacp, lacp_send_pdu *send_pdu) 
OVS_EXCLUDED(mutex)
 
 /* Causes poll_block() to wake up when lacp_run() needs to be called again. */
 void
-lacp_wait(struct lacp *lacp) OVS_EXCLUDED(mutex)
+lacp_wait(struct lacp *lacp, const void *bond) OVS_EXCLUDED(mutex)
 {
 struct slave *slave;
 
 lacp_lock();
 HMAP_FOR_EACH (slave, node, &lacp->slaves) {
-if (slave_may_tx(slave)) {
+if (slave_may_tx(bond, slave)) {
 timer_wait(&slave->tx);
 }
 
@@ -810,9 +828,26 @@ slave_get_priority(struct slave *slave, struct lacp_info 
*prio