Hi Vlad,
I have 2 patches for ewg that were released to upstream kernel (see below)
The first was approved unofficially and I hope that the second one will also be 
approved.
Please add the patches (in attachments)  in kernel_patches/fixes under 
ofa-kernel.

thanks

Moni

-------- Original Message --------
Subject: [ofa-general] [PATCH 0/2] IB: Improve recovery from SM change events   
after takeover
Date: Sun, 18 May 2008 15:25:24 +0300
From: Moni Shoua <[EMAIL PROTECTED]>
To: Roland Dreier <[EMAIL PROTECTED]>
CC: Olga Stern <[EMAIL PROTECTED]>,     OpenFabrics General <[EMAIL 
PROTECTED]>,        Moni Levy <[EMAIL PROTECTED]>


The patches below improve the the recovery of the IPoIB driver from
a faulure of  the SM and taking over by another SM. The purpose was
to minimize the the time that 2 hosts with IPoIB stay remain disconnected
after SM takeover event. 

Here is an example that was viewed in our tests.
One IPoIB host (client) sends a stream of multicast packets to another IPoIB 
host (server).
SM takeover event takes place during traffic and as a result multicast info is 
flushed
and there is a need to rejoin by hosts. Without the patch there is a chance 
(which according to our experience
is a very big chance) that the request to rejoin will be to the old  SM and 
only after a retry join completes successfully.

Our tests for IP multicast and unicast traffic between 2 hosts show that 
without the patch there
is a period of time of  up to 5  seconds  that that communication is lost and 
with the
patch the time decreases  to less than a second.


_______________________________________________
general mailing list
[EMAIL PROTECTED]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

The purpose of this patch is to make the events that are related to SM change
(namely CLIENT_REREGISTER event and SM_CHANGE event) less disruptive.
When SM related events are handled, it is not necessary to flush unicast
info from device but only multicast info. This patch divides the events that are
handled by IPoIB to three categories; 0, 1 and 2 (when 2 does more than 1 and 1
does more than 0).
The main change is in __ipoib_ib_dev_flush(). Instead of flagging  to the function
about pkey_events we now use leveling. An event that requires "harder" flushing
calls this function with higher number for level. Besides the concept,
the actual change is that SM related events are not  flushing unicast info and
not bringing the device down but only refresh the multicast info in the background.

Signed-off-by: Moni Levy  <[EMAIL PROTECTED]>
Signed-off-by: Moni Shoua <[EMAIL PROTECTED]>

---

 drivers/infiniband/ulp/ipoib/ipoib.h       |    9 ++++---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |   37 ++++++++++++++++++-----------
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |    5 ++-
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |   19 +++++++-------
 4 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc..8ed4dc0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -276,10 +276,11 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct flush_task0;
+	struct work_struct flush_task1;
+	struct work_struct flush_task2;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8		  port;
@@ -427,7 +428,9 @@ void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush0(struct work_struct *work);
+void ipoib_ib_dev_flush1(struct work_struct *work);
+void ipoib_ib_dev_flush2(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f429bce..2a9c058 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -898,12 +898,14 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
 	u16 new_index;
 
+	ipoib_dbg(priv, "Try flushing level %d\n", level);
+
 	mutex_lock(&priv->vlan_mutex);
 
 	/*
@@ -911,7 +913,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -925,7 +927,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == 2) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -943,11 +945,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
 
-	ipoib_ib_dev_down(dev, 0);
+	ipoib_mcast_dev_flush(dev);
+
+	if (level >= 1)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level >= 2) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
@@ -957,29 +961,36 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= 1)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush0(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_task0);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
 	__ipoib_ib_dev_flush(priv, 0);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush1(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_task1);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
 	__ipoib_ib_dev_flush(priv, 1);
 }
 
+void ipoib_ib_dev_flush2(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_task2);
+
+	__ipoib_ib_dev_flush(priv, 2);
+}
+
 void ipoib_ib_dev_cleanup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090..2808023 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -989,9 +989,10 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_WORK(&priv->flush_task0,   ipoib_ib_dev_flush0);
+	INIT_WORK(&priv->flush_task1,   ipoib_ib_dev_flush1);
+	INIT_WORK(&priv->flush_task2,   ipoib_ib_dev_flush2);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8766d29..80c0409 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR    ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE  ||
-	    record->event == IB_EVENT_SM_CHANGE   ||
-	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+	ipoib_dbg(priv, "Event %d on device %s port %d\n",record->event,
+			record->device->name, record->element.port_num);
+	if ( record->event == IB_EVENT_SM_CHANGE   ||
+	     record->event == IB_EVENT_CLIENT_REREGISTER) {
+		queue_work(ipoib_workqueue, &priv->flush_task0);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_task1);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_task2);
 	}
 }
The purpose of this patch is to make the events that are related to SM change
(namely CLIENT_REREGISTER event and SM_CHANGE event) less disruptive.
When SM related events are handled, it is not necessary to flush unicast
info from device but only multicast info. This patch divides the events that are
handled by IPoIB to three categories; 0, 1 and 2 (when 2 does more than 1 and 1
does more than 0).
The main change is in __ipoib_ib_dev_flush(). Instead of flagging  to the function
about pkey_events we now use leveling. An event that requires "harder" flushing
calls this function with higher number for level. Besides the concept,
the actual change is that SM related events are not  flushing unicast info and
not bringing the device down but only refresh the multicast info in the background.

Signed-off-by: Moni Levy  <[EMAIL PROTECTED]>
Signed-off-by: Moni Shoua <[EMAIL PROTECTED]>

---

 drivers/infiniband/ulp/ipoib/ipoib.h       |    9 ++++---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |   37 ++++++++++++++++++-----------
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |    5 ++-
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |   19 +++++++-------
 4 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index ca126fc..8ed4dc0 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -276,10 +276,11 @@ struct ipoib_dev_priv {
 
 	struct delayed_work pkey_poll_task;
 	struct delayed_work mcast_task;
-	struct work_struct flush_task;
+	struct work_struct flush_task0;
+	struct work_struct flush_task1;
+	struct work_struct flush_task2;
 	struct work_struct restart_task;
 	struct delayed_work ah_reap_task;
-	struct work_struct pkey_event_task;
 
 	struct ib_device *ca;
 	u8		  port;
@@ -427,7 +428,9 @@ void ipoib_flush_paths(struct net_device *dev);
 struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
 
 int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
-void ipoib_ib_dev_flush(struct work_struct *work);
+void ipoib_ib_dev_flush0(struct work_struct *work);
+void ipoib_ib_dev_flush1(struct work_struct *work);
+void ipoib_ib_dev_flush2(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index f429bce..2a9c058 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -898,12 +898,14 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	return 0;
 }
 
-static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int level)
 {
 	struct ipoib_dev_priv *cpriv;
 	struct net_device *dev = priv->dev;
 	u16 new_index;
 
+	ipoib_dbg(priv, "Try flushing level %d\n", level);
+
 	mutex_lock(&priv->vlan_mutex);
 
 	/*
@@ -911,7 +913,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * the parent is down.
 	 */
 	list_for_each_entry(cpriv, &priv->child_intfs, list)
-		__ipoib_ib_dev_flush(cpriv, pkey_event);
+		__ipoib_ib_dev_flush(cpriv, level);
 
 	mutex_unlock(&priv->vlan_mutex);
 
@@ -925,7 +927,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		return;
 	}
 
-	if (pkey_event) {
+	if (level == 2) {
 		if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) {
 			clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 			ipoib_ib_dev_down(dev, 0);
@@ -943,11 +945,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 		priv->pkey_index = new_index;
 	}
 
-	ipoib_dbg(priv, "flushing\n");
 
-	ipoib_ib_dev_down(dev, 0);
+	ipoib_mcast_dev_flush(dev);
+
+	if (level >= 1)
+		ipoib_ib_dev_down(dev, 0);
 
-	if (pkey_event) {
+	if (level >= 2) {
 		ipoib_ib_dev_stop(dev, 0);
 		ipoib_ib_dev_open(dev);
 	}
@@ -957,29 +961,36 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, int pkey_event)
 	 * we get here, don't bring it back up if it's not configured up
 	 */
 	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
-		ipoib_ib_dev_up(dev);
+		if (level >= 1)
+			ipoib_ib_dev_up(dev);
 		ipoib_mcast_restart_task(&priv->restart_task);
 	}
 }
 
-void ipoib_ib_dev_flush(struct work_struct *work)
+void ipoib_ib_dev_flush0(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, flush_task);
+		container_of(work, struct ipoib_dev_priv, flush_task0);
 
-	ipoib_dbg(priv, "Flushing %s\n", priv->dev->name);
 	__ipoib_ib_dev_flush(priv, 0);
 }
 
-void ipoib_pkey_event(struct work_struct *work)
+void ipoib_ib_dev_flush1(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv =
-		container_of(work, struct ipoib_dev_priv, pkey_event_task);
+		container_of(work, struct ipoib_dev_priv, flush_task1);
 
-	ipoib_dbg(priv, "Flushing %s and restarting its QP\n", priv->dev->name);
 	__ipoib_ib_dev_flush(priv, 1);
 }
 
+void ipoib_ib_dev_flush2(struct work_struct *work)
+{
+	struct ipoib_dev_priv *priv =
+		container_of(work, struct ipoib_dev_priv, flush_task2);
+
+	__ipoib_ib_dev_flush(priv, 2);
+}
+
 void ipoib_ib_dev_cleanup(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 2442090..2808023 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -989,9 +989,10 @@ static void ipoib_setup(struct net_device *dev)
 	INIT_LIST_HEAD(&priv->multicast_list);
 
 	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
-	INIT_WORK(&priv->pkey_event_task, ipoib_pkey_event);
 	INIT_DELAYED_WORK(&priv->mcast_task,   ipoib_mcast_join_task);
-	INIT_WORK(&priv->flush_task,   ipoib_ib_dev_flush);
+	INIT_WORK(&priv->flush_task0,   ipoib_ib_dev_flush0);
+	INIT_WORK(&priv->flush_task1,   ipoib_ib_dev_flush1);
+	INIT_WORK(&priv->flush_task2,   ipoib_ib_dev_flush2);
 	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
 	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 8766d29..80c0409 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -289,15 +289,16 @@ void ipoib_event(struct ib_event_handler *handler,
 	if (record->element.port_num != priv->port)
 		return;
 
-	if (record->event == IB_EVENT_PORT_ERR    ||
-	    record->event == IB_EVENT_PORT_ACTIVE ||
-	    record->event == IB_EVENT_LID_CHANGE  ||
-	    record->event == IB_EVENT_SM_CHANGE   ||
-	    record->event == IB_EVENT_CLIENT_REREGISTER) {
-		ipoib_dbg(priv, "Port state change event\n");
-		queue_work(ipoib_workqueue, &priv->flush_task);
+	ipoib_dbg(priv, "Event %d on device %s port %d\n",record->event,
+			record->device->name, record->element.port_num);
+	if ( record->event == IB_EVENT_SM_CHANGE   ||
+	     record->event == IB_EVENT_CLIENT_REREGISTER) {
+		queue_work(ipoib_workqueue, &priv->flush_task0);
+	} else if (record->event == IB_EVENT_PORT_ERR ||
+		   record->event == IB_EVENT_PORT_ACTIVE ||
+		   record->event == IB_EVENT_LID_CHANGE) {
+		queue_work(ipoib_workqueue, &priv->flush_task1);
 	} else if (record->event == IB_EVENT_PKEY_CHANGE) {
-		ipoib_dbg(priv, "P_Key change event on port:%d\n", priv->port);
-		queue_work(ipoib_workqueue, &priv->pkey_event_task);
+		queue_work(ipoib_workqueue, &priv->flush_task2);
 	}
 }
_______________________________________________
ewg mailing list
ewg@lists.openfabrics.org
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg

Reply via email to