[PATCH 1/2] ib/ipoib: allow disabling/enabling TSO through ethtool

2010-03-04 Thread Or Gerlitz
allow disabling/enabling TSO on the fly by ethtool

Signed-off-by: Or Gerlitz ogerl...@voltaire.com

---
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c |   19 +++
 1 file changed, 19 insertions(+)

Index: linux-2.6.33/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
===
--- linux-2.6.33.orig/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ linux-2.6.33/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -49,6 +49,24 @@ static u32 ipoib_get_rx_csum(struct net_
 		!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 }

+static int ipoib_set_tso(struct net_device *dev, u32 data)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	if (data) {
+		if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
+		    (dev->features & NETIF_F_SG) &&
+		    (priv->hca_caps & IB_DEVICE_UD_TSO))
+			dev->features |= NETIF_F_TSO;
+		else {
+			ipoib_warn(priv, "can't set TSO on\n");
+			return -EOPNOTSUPP;
+		}
+	} else
+		dev->features &= ~NETIF_F_TSO;
+	return 0;
+}
+
 static int ipoib_get_coalesce(struct net_device *dev,
  struct ethtool_coalesce *coal)
 {
@@ -131,6 +149,7 @@ static void ipoib_get_ethtool_stats(stru
 static const struct ethtool_ops ipoib_ethtool_ops = {
.get_drvinfo= ipoib_get_drvinfo,
.get_rx_csum= ipoib_get_rx_csum,
+   .set_tso= ipoib_set_tso,
.get_coalesce   = ipoib_get_coalesce,
.set_coalesce   = ipoib_set_coalesce,
.get_flags  = ethtool_op_get_flags,
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] ib/ipoib: include err code in trace message for ib_post_send() failures

2010-03-04 Thread Or Gerlitz
print the return code of ib_post_send() if it fails
to help debug errors

Signed-off-by: Or Gerlitz ogerl...@voltaire.com

---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c |8 +---
 drivers/infiniband/ulp/ipoib/ipoib_ib.c |9 +
 2 files changed, 10 insertions(+), 7 deletions(-)

Index: a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -529,7 +529,7 @@ void ipoib_send(struct net_device *dev,
 {
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_tx_buf *tx_req;
-   int hlen;
+   int hlen, rc;
void *phead;

if (skb_is_gso(skb)) {
@@ -585,9 +585,10 @@ void ipoib_send(struct net_device *dev,
netif_stop_queue(dev);
}

-	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-			       address->ah, qpn, tx_req, phead, hlen))) {
-		ipoib_warn(priv, "post_send failed\n");
+	rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
+		       address->ah, qpn, tx_req, phead, hlen);
+	if (unlikely(rc)) {
+		ipoib_warn(priv, "post_send failed, error %d\n", rc);
 		++dev->stats.tx_errors;
 		--priv->tx_outstanding;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
Index: a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -708,6 +708,7 @@ void ipoib_cm_send(struct net_device *de
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_cm_tx_buf *tx_req;
u64 addr;
+   int rc;

 	if (unlikely(skb->len > tx->mtu)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
@@ -739,9 +740,10 @@ void ipoib_cm_send(struct net_device *de

 	tx_req->mapping = addr;
 
-	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-			       addr, skb->len))) {
-		ipoib_warn(priv, "post_send failed\n");
+	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+		       addr, skb->len);
+	if (unlikely(rc)) {
+		ipoib_warn(priv, "post_send failed, error %d\n", rc);
 		++dev->stats.tx_errors;
 		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
 		dev_kfree_skb_any(skb);
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] ib/ipoib: allow disabling/enabling TSO through ethtool

2010-03-04 Thread Eli Cohen
On Thu, Mar 04, 2010 at 03:16:52PM +0200, Or Gerlitz wrote:
 
> +static int ipoib_set_tso(struct net_device *dev, u32 data)
> +{
> +	struct ipoib_dev_priv *priv = netdev_priv(dev);
> +
> +	if (data) {
> +		if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
> +		    (dev->features & NETIF_F_SG) &&
> +		    (priv->hca_caps & IB_DEVICE_UD_TSO))
> +			dev->features |= NETIF_F_TSO;
> +		else {
> +			ipoib_warn(priv, "can't set TSO on\n");
> +			return -EOPNOTSUPP;
> +		}
> +	} else
> +		dev->features &= ~NETIF_F_TSO;
> +	return 0;
> +}
> +
 +

I believe dev->features should be protected by the rtnl lock.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] nfsrdma fails to write big file,

2010-03-04 Thread Mahesh Siddheshwar

Tom Tucker wrote:

Mahesh Siddheshwar wrote:

Hi Tom, Vu,

Tom Tucker wrote:

Roland Dreier wrote:
  +   /*
  +    * Add room for frmr register and invalidate WRs
  +    * Requests sometimes have two chunks, each chunk
  +    * requires to have different frmr. The safest
  +    * WRs required are max_send_wr * 6; however, we
  +    * get send completions and poll fast enough, it
  +    * is pretty safe to have max_send_wr * 4.
  +    */
  +   ep->rep_attr.cap.max_send_wr *= 4;

Seems like a bad design if there is a possibility of work queue
overflow; if you're counting on events occurring in a particular order
or completions being handled fast enough, then your design is going to
fail in some high load situations, which I don't think you want.


Vu,

Would you please try the following:

- Set the multiplier to 5

While trying to test this between a Linux client and Solaris server,
I made the following changes in :
/usr/src/ofa_kernel-1.5.1/net/sunrpc/xprtrdma/verbs.c

diff verbs.c.org verbs.c
653c653
<   ep->rep_attr.cap.max_send_wr *= 3;
---
>   ep->rep_attr.cap.max_send_wr *= 8;
685c685
<   ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /*  - 1*/;
---
>   ep->rep_cqinit = ep->rep_attr.cap.max

(I bumped it to 8)

did make install.
On reboot I see the errors on NFS READs as opposed to WRITEs
as seen before, when I try to read a 10G file from the server.

The client is running: RHEL 5.3 (2.6.18-128.el5PAE) with
OFED-1.5.1-20100223-0740 bits. The client has an Sun IB
HCA: SUN0070130001, MT25418, 2.7.0 firmware, hw_rev = a0.
The server is running Solaris based on snv_128.

rpcdebug output from the client:

==
RPC:85 call_bind (status 0)
RPC:85 call_connect xprt ec78d800 is connected
RPC:85 call_transmit (status 0)
RPC:85 xprt_prepare_transmit
RPC:85 xprt_cwnd_limited cong = 0 cwnd = 8192
RPC:85 rpc_xdr_encode (status 0)
RPC:85 marshaling UNIX cred eddb4dc0
RPC:85 using AUTH_UNIX cred eddb4dc0 to wrap rpc data
RPC:85 xprt_transmit(164)
RPC:   rpcrdma_inline_pullup: pad 0 destp 0xf1dd1410 len 164 
hdrlen 164
RPC:   rpcrdma_register_frmr_external: Using frmr ec7da920 to map 
4 segments
RPC:   rpcrdma_create_chunks: write chunk elem 
16...@0x38536d000:0xa601 (more)
RPC:   rpcrdma_register_frmr_external: Using frmr ec7da960 to map 
1 segments
RPC:   rpcrdma_create_chunks: write chunk elem 
1...@0x31dd153c:0xaa01 (last)
RPC:   rpcrdma_marshal_req: write chunk: hdrlen 68 rpclen 164 
padlen 0 headerp 0xf1dd124c base 0xf1dd136c lkey 0x500

RPC:85 xmit complete
RPC:85 sleep_on(queue xprt_pending time 4683109)
RPC:85 added to queue ec78d994 xprt_pending
RPC:85 setting alarm for 6 ms
RPC:   wake_up_next(ec78d944 xprt_resend)
RPC:   wake_up_next(ec78d8f4 xprt_sending)
RPC:   rpcrdma_qp_async_error_upcall: QP error 3 on device mlx4_0 
ep ec78db40

RPC:85 __rpc_wake_up_task (now 4683110)
RPC:85 disabling timer
RPC:85 removed from queue ec78d994 xprt_pending
RPC:   __rpc_wake_up_task done
RPC:85 __rpc_execute flags=0x1
RPC:85 call_status (status -107)
RPC:85 call_bind (status 0)
RPC:85 call_connect xprt ec78d800 is not connected
RPC:85 xprt_connect xprt ec78d800 is not connected
RPC:85 sleep_on(queue xprt_pending time 4683110)
RPC:85 added to queue ec78d994 xprt_pending
RPC:85 setting alarm for 6 ms
RPC:   rpcrdma_event_process: event rep ec116800 status 5 opcode 
80 length 2493606

RPC:   rpcrdma_event_process: recv WC status 5, connection lost
RPC:   rpcrdma_conn_upcall: disconnected: ec78dbccI4:20049 (ep 
0xec78db40 event 0xa)

RPC:   rpcrdma_conn_upcall: disconnected
rpcrdma: connection to ec78dbccI4:20049 closed (-103)
RPC:   xprt_rdma_connect_worker: reconnect
==

On the server I see:

Mar  3 17:45:16 elena-ar hermon: [ID 271130 kern.notice] NOTICE: 
hermon0: Device Error: CQE remote access error
Mar  3 17:45:16 elena-ar nfssrv: [ID 819430 kern.notice] NOTICE: NFS: 
bad sendreply
Mar  3 17:45:21 elena-ar hermon: [ID 271130 kern.notice] NOTICE: 
hermon0: Device Error: CQE remote access error
Mar  3 17:45:21 elena-ar nfssrv: [ID 819430 kern.notice] NOTICE: NFS: 
bad sendreply


The remote access error is actually seen on RDMA_WRITE.
Doing some more debug on the server with DTrace, I see that
the destination address and length matches the write chunk
element in the Linux debug output above.


 0   9385  rib_write:entry daddr 38536d000, len 4000, 
hdl a601

 0   9358 rib_init_sendwait:return ff44a715d308
 1   9296   rib_svc_scq_handler:return 1f7
 1   9356  rib_sendwait:return 14
 1   9386 rib_write:return 14

^^^ that is RDMA_FAILED in
 1  63295xdrrdma_send_read_data:return 0
 1   5969  xdr_READ3res:return
 1   5969  xdr_READ3res:return 

RE: [PATCH] ib_mad: Ignore iWARP devices on device removal.

2010-03-04 Thread Sean Hefty
When you unload an iWARP device, the ib_mad module logs errors.
It should be ignoring iWARP devices on device removal just like it does
on device add.

Signed-off-by: Steve Wise sw...@opengridcomputing.com
Acked-by: Sean Hefty sean.he...@intel.com

---
Looks okay to me.  Thanks, Steve.


 drivers/infiniband/core/mad.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 7522008..6216187 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2964,6 +2964,9 @@ static void ib_mad_remove_device(struct ib_device
*device)
 {
   int i, num_ports, cur_port;

+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
 	if (device->node_type == RDMA_NODE_IB_SWITCH) {
   num_ports = 1;
   cur_port = 0;


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 09/11] opensm: Make it possible to configure no fallback routing engine.

2010-03-04 Thread Jim Schutt

On Thu, 2010-03-04 at 07:35 -0700, Yevgeny Kliteynik wrote:
 Hi Jim,
 
 On 20/Nov/09 21:15, Jim Schutt wrote:
  For a fabric that requires routing with an engine with special properties,
  say avoiding credit loops via making use of SLs in routing, it might
  be preferable to not fall back to minhop if the configured routing engine
  fails.
 
  E.g. the torus-2QoS routing engine uses both SL2VL maps and path SL values
  to provide routing free of credit loops, but cannot route fabrics for
  some patterns of failed switches.  Should a switch fail that creates such
  a pattern, it may be preferable to keep the previous routing information
  loaded in the switches until a switch can be replaced that restores
  torus-2QoS's ability to route the fabric.
 
  The alternative, having some other engine route the fabric, will immediately
  introduce credit loops.
 
 This is a great idea.
 Regarding the implementation: I would prefer seeing this
 as a purely OpenSM option and not as a new routing engine
 keyword.
 I think it would be cleaner to leave the list of routing
 engines w/o special keys, and have a general option
 that would prevent SM from falling back. 

That seems right to me, now.

 Actually, the
 fall-back itself is not bad, as it is defined by the list
 of routing engines, and SM should try them one by one.
 The problem is with using default routing that is not
 specified in the routing engines list.

I agree.  If a user explicitly configures which
routing engines to try, only those should be used,
and a notification logged if they all fail.

 
 Here's the patch that implements OSM option
 use_default_routing, and a command line parameter
 no_default_routing to control this option.

This looks good to me.

 
 I'll write the patch that adds this option to the
 OSM trunk and send it to Sasha shortly.

OK, thanks.

-- Jim

 
 Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il
 ---
   opensm/include/opensm/osm_subnet.h |2 +-
   opensm/opensm/main.c   |9 +
   opensm/opensm/osm_opensm.c |   10 --
   opensm/opensm/osm_subnet.c |8 
   opensm/opensm/osm_ucast_mgr.c  |7 +--
   5 files changed, 27 insertions(+), 9 deletions(-)
 
 diff --git a/opensm/include/opensm/osm_subnet.h 
 b/opensm/include/opensm/osm_subnet.h
 index a4133a0..905f64d 100644
 --- a/opensm/include/opensm/osm_subnet.h
 +++ b/opensm/include/opensm/osm_subnet.h
 @@ -190,6 +190,7 @@ typedef struct osm_subn_opt {
   boolean_t sweep_on_trap;
   char *routing_engine_names;
   boolean_t use_ucast_cache;
 + boolean_t use_default_routing;
   boolean_t connect_roots;
   char *lid_matrix_dump_file;
   char *lfts_file;
 @@ -215,7 +216,6 @@ typedef struct osm_subn_opt {
   osm_qos_options_t qos_rtr_options;
   boolean_t enable_quirks;
   boolean_t no_clients_rereg;
 - boolean_t no_fallback_routing_engine;
   #ifdef ENABLE_OSM_PERF_MGR
   boolean_t perfmgr;
   boolean_t perfmgr_redir;
 diff --git a/opensm/opensm/main.c b/opensm/opensm/main.c
 index 096bf5f..47075a2 100644
 --- a/opensm/opensm/main.c
 +++ b/opensm/opensm/main.c
 @@ -175,6 +175,10 @@ static void show_usage(void)
separated by commas so that specific ordering of 
 routing\n
algorithms will be tried if earlier routing engines 
 fail.\n
Supported engines: updn, file, ftree, lash, dor, 
 torus-2QoS\n\n);
 + printf(--no_default_routing\n
 +  This option prevents OpenSM from falling back to 
 default\n
 +  routing if none of the provided engines was able to\n
 +  configure the subnet.\n\n);
   printf(--do_mesh_analysis\n
This option enables additional analysis for the 
 lash\n
routing engine to precondition switch port 
 assignments\n
 @@ -612,6 +616,7 @@ int main(int argc, char *argv[])
   {sm_sl, 1, NULL, 7},
   {retries, 1, NULL, 8},
   {torus_config, 1, NULL, 9},
 + {no_default_routing, 0, NULL, 10},
   {NULL, 0, NULL, 0}  /* Required at the end of the array */
   };
   
 @@ -993,6 +998,10 @@ int main(int argc, char *argv[])
   case 9:
   SET_STR_OPT(opt.torus_conf_file, optarg);
   break;
 + case 10:
 + opt.use_default_routing = FALSE;
 + printf( No fall back to default routing\n);
 + break;
   case 'h':
   case '?':
   case ':':
 diff --git a/opensm/opensm/osm_opensm.c b/opensm/opensm/osm_opensm.c
 index e7ef55c..d153be5 100644
 --- a/opensm/opensm/osm_opensm.c
 +++ b/opensm/opensm/osm_opensm.c
 @@ -159,11 +159,6 @@ static struct osm_routing_engine 
 *setup_routing_engine(osm_opensm_t *osm,
   struct osm_routing_engine *re;
   const struct