Re: [openib-general] [PATCH] Convert idr's internal locking to _irqsave variant

2006-07-14 Thread Arjan van de Ven
On Thu, 2006-07-13 at 17:18 -0700, Roland Dreier wrote:
 Arjan it does get harder if this is needed for your IB device to
 Arjan do more work, so that your swap device on your IB can take
 Arjan more IO's to free up ram..
 
 That's the classic problem, but it's more a matter of the consumer
 using GFP_NOIO in the right places.

GFP_NOIO isn't going to save you in the cases where the memory really is
running low and you need the memory to do more IO...



___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



[openib-general] [PATCHv2] OpenSM/osm_sa_path_record.c: Support SL in PathRecord requests based on whether or not QoS is enabled

2006-07-14 Thread Hal Rosenstock
OpenSM/osm_sa_path_record.c: Support SL in PathRecord requests based on
whether or not QoS is enabled

Signed-off-by: Hal Rosenstock [EMAIL PROTECTED]

Index: opensm/osm_sa_path_record.c
===
--- opensm/osm_sa_path_record.c (revision 8507)
+++ opensm/osm_sa_path_record.c (working copy)
@@ -66,6 +66,7 @@
 #include opensm/osm_helper.h
 #include opensm/osm_pkey.h
 #include opensm/osm_multicast.h
+#include opensm/osm_partition.h
 
 #define OSM_PR_RCV_POOL_MIN_SIZE64
 #define OSM_PR_RCV_POOL_GROW_SIZE   64
@@ -164,6 +165,7 @@ __osm_pr_rcv_get_path_parms(
   const osm_physp_t*   p_physp;
   const osm_physp_t*   p_dest_physp;
   const osm_switch_t*  p_sw;
+  const osm_prtn_t*p_prtn;
   const ib_port_info_t*p_pi;
   const cl_qmap_t* p_sw_tbl;
   ib_api_status_t  status = IB_SUCCESS;
@@ -174,6 +176,7 @@ __osm_pr_rcv_get_path_parms(
   uint8_t  required_mtu;
   uint8_t  required_rate;
   uint8_t  required_pkt_life;
+  uint8_t  sl;
   ib_net16_t   dest_lid;
 
   OSM_LOG_ENTER( p_rcv-p_log, __osm_pr_rcv_get_path_parms );
@@ -548,7 +551,6 @@ __osm_pr_rcv_get_path_parms(
   p_parms-mtu = mtu;
   p_parms-rate = rate;
   p_parms-pkt_life = pkt_life;
-  p_parms-sl = OSM_DEFAULT_SL;
 
   if( comp_mask  IB_PR_COMPMASK_RAWTRAFFIC 
   cl_ntoh32( p_pr-hop_flow_raw )  ( 131 ) )
@@ -560,8 +562,9 @@ __osm_pr_rcv_get_path_parms(
 {
   osm_log( p_rcv-p_log, OSM_LOG_ERROR,
__osm_pr_rcv_get_path_parms: ERR 1F1A: 
-   Ports do not share specified P_Key\n);
+   Ports do not share specified P_Key 0x%4x\n, cl_ntoh16(pkey));
   status = IB_NOT_FOUND;
+  goto Exit;
 }
   }
   else
@@ -573,9 +576,32 @@ __osm_pr_rcv_get_path_parms(
__osm_pr_rcv_get_path_parms: ERR 1F1B: 
Ports do not have any shared P_Keys\n);
   status = IB_NOT_FOUND;
+  goto Exit;
 }
   }
+
+  p_prtn = (osm_prtn_t *)cl_qmap_get(p_rcv-p_subn-prtn_pkey_tbl,
+ cl_ntoh16(pkey  ~0x8000));
+  if ( p_prtn == (osm_prtn_t *)cl_qmap_end(p_rcv-p_subn-prtn_pkey_tbl) )
+  {
+/* this may be possible when pkey tables are created somehow in
+   previous runs or things are going wrong here */
+sl = OSM_DEFAULT_SL;
+osm_log( p_rcv-p_log, OSM_LOG_VERBOSE,
+ __osm_pr_rcv_get_path_parms: ERR 1F1C: 
+ No partition found for P_Key 0x%04x - using default SL %d\n, 
cl_ntoh16(pkey), sl );
+  }
+  else
+sl = p_prtn-sl;
+
+  if ( ( comp_mask  IB_PR_COMPMASK_SL )  ib_path_rec_sl( p_pr ) != sl )
+  {
+status = IB_NOT_FOUND;
+goto Exit;
+  }
+
   p_parms-pkey = pkey;
+  p_parms-sl = sl;
 
  Exit:
   OSM_LOG_EXIT( p_rcv-p_log );
@@ -613,7 +639,7 @@ __osm_pr_rcv_build_pr(
   p_pr-slid = cl_hton16( src_lid_ho );
 
   p_pr-pkey = p_parms-pkey;
-  p_pr-sl = p_parms-sl;
+  p_pr-sl = cl_hton16(p_parms-sl);
   p_pr-mtu = (uint8_t)(p_parms-mtu | 0x80);
   p_pr-rate = (uint8_t)(p_parms-rate | 0x80);
 
@@ -987,18 +1013,6 @@ __osm_pr_rcv_get_end_points(
 into the endpoints.
   */
 
-  if( comp_mask  IB_PR_COMPMASK_SL )
-  {
-if( p_pr-sl != OSM_DEFAULT_SL )
-{
-  if ( p_sa_mad-method == IB_MAD_METHOD_GET )
-sa_status = IB_SA_MAD_STATUS_NO_RECORDS;
-  *pp_src_port = 0;
-  *pp_dest_port = 0;
-  goto Exit;
-}
-  }
-
   if( comp_mask  IB_PR_COMPMASK_SGID )
   {
 *pp_src_port = (osm_port_t*)cl_qmap_get(
@@ -1414,7 +1428,7 @@ __osm_pr_match_mgrp_attributes(
 
   if( comp_mask  IB_PR_COMPMASK_SL )
   {
-if( ( p_pr-sl  0xf ) != sl )
+if( ib_path_rec_sl( p_pr ) != sl )
   goto Exit;
   }
 
@@ -1422,7 +1436,7 @@ __osm_pr_match_mgrp_attributes(
   if( ( comp_mask  IB_PR_COMPMASK_NUMBPATH ) 
   ( p_sa_mad-method != IB_MAD_METHOD_GET ) )
   {
-if( ( p_pr-num_path  0x7f ) == 0 )
+if( ib_path_rec_num_path( p_pr ) == 0 )
   goto Exit;
   }
 




___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



[openib-general] [PATCHv2] OpenSM: Support configurable SL per partition

2006-07-14 Thread Hal Rosenstock
OpenSM: Support configurable SL per partition

Signed-off-by: Sasha Khapyorsky [EMAIL PROTECTED]
Signed-off-by: Hal Rosenstock [EMAIL PROTECTED]

Index: include/opensm/osm_partition.h
===
--- include/opensm/osm_partition.h  (revision 8520)
+++ include/opensm/osm_partition.h  (working copy)
@@ -102,6 +102,7 @@
 {
cl_map_item_t   map_item;
uint16_tpkey;
+   uint8_t sl; 
cl_map_tfull_guid_tbl;
cl_map_tpart_guid_tbl;
charname[32];
@@ -114,6 +115,9 @@
 *  pkey
 *  The IBA defined P_KEY of this Partition.
 *
+*  sl
+*  The Service Level (SL) associated with this Partition.
+*
 *  port_guid_tbl
 *  Container of pointers to all Port objects in the Partition,
 *  indexed by port GUID.
Index: opensm/osm_prtn.c
===
--- opensm/osm_prtn.c   (revision 8520)
+++ opensm/osm_prtn.c   (working copy)
@@ -79,6 +79,7 @@
 
memset(p, 0, sizeof(*p));
p-pkey = pkey;
+   p-sl = OSM_DEFAULT_SL;
cl_map_construct(p-full_guid_tbl);
cl_map_init(p-full_guid_tbl, 32);
cl_map_construct(p-part_guid_tbl);
@@ -220,7 +221,7 @@
mc_rec.pkey = pkey;
mc_rec.rate = rate ? rate : 0x3; /* 10Gb/sec */
mc_rec.pkt_life = OSM_DEFAULT_SUBNET_TIMEOUT;
-   mc_rec.sl_flow_hop = OSM_DEFAULT_SL  28;
+   mc_rec.sl_flow_hop = ib_member_set_sl_flow_hop(p-sl, 0, 0);
/* Note: scope needs to be consistent with MGID */
mc_rec.scope_state = 0x21;
 
Index: opensm/osm_prtn_config.c
===
--- opensm/osm_prtn_config.c(revision 8520)
+++ opensm/osm_prtn_config.c(working copy)
@@ -52,8 +52,10 @@
 #include string.h
 #include errno.h
 #include ctype.h
+#include limits.h
 
 #include iba/ib_types.h
+#include opensm/osm_base.h
 #include opensm/osm_partition.h
 #include opensm/osm_subnet.h
 #include opensm/osm_log.h
@@ -82,7 +84,7 @@
osm_log_t  *p_log;
osm_subn_t *p_subn;
osm_prtn_t *p_prtn;
-   unsignedis_ipoib, mtu, rate;
+   unsignedis_ipoib, mtu, rate, sl;
 };
 
 
@@ -122,6 +124,16 @@
if (!conf-p_prtn)
return -1;
 
+   if (conf-p_subn-opt.no_qos) {
+   if (conf-sl != OSM_DEFAULT_SL) {
+   osm_log(conf-p_log, OSM_LOG_ERROR,
+   partition_create: Overriding SL %d to default 
SL %d on partition %s as QoS not enabled\n,
+   conf-sl, OSM_DEFAULT_SL, name);
+ conf-sl = OSM_DEFAULT_SL;
+   }
+   }
+   conf-p_prtn-sl = conf-sl;
+
if (conf-is_ipoib)
osm_prtn_add_mcgroup(conf-p_log, conf-p_subn, conf-p_prtn,
 conf-is_ipoib, conf-rate, conf-mtu);
@@ -145,6 +157,17 @@
PARSEWARN(conf-p_log, lineno,
flag \'rate\' requires valid value
 - skipped.\n);
+   } else if (!strncmp(flag, sl, len)) {
+   unsigned sl;
+   char *end;
+
+   if (!val || !*val || (sl = strtoul(val, end, 0))  15 ||
+   (*end  !isspace(*end)))
+   PARSEWARN(conf-p_log, lineno,
+   flag \'sl\' requires valid value
+- skipped.\n);
+   else
+   conf-sl = sl;
} else {
PARSEWARN(conf-p_log, lineno,
unrecognized partition flag \'%s\'
@@ -254,6 +277,8 @@
conf-p_log = p_log;
conf-p_subn = p_subn;
conf-p_prtn = NULL;
+   conf-is_ipoib = 0;
+   conf-sl = OSM_DEFAULT_SL;
return conf;
 }
 
Index: doc/partition-config.txt
===
--- doc/partition-config.txt(revision 8520)
+++ doc/partition-config.txt(working copy)
@@ -46,6 +46,7 @@
  result IPoIB capable MC group will be created.
 rate=val - specifies rate for this IPoIB MC group (default is 3 (10GBps))
 mtu=val  - specifies MTU for this IPoIB MC group (default is 4 (2048))
+sl=val   - specifies SL for this IPoIB MC group (default is 0)
 
 Note that values for 'rate' and 'mtu' should be specified as defined in
 IBTA specification (for example mtu=4 for 2048).




___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] [PATCH] [RFC] librdmacm: expose device list to users

2006-07-14 Thread Pradipta Kumar Banerjee
Sean Hefty wrote:
 The following patch adds calls to the userspace RDMA CM to return its list
 of RDMA devices.  The calls are similar to ibv_get_device_list() /
 ibv_free_device_list().
Thanks Sean for adding this functionality. This was needed.

Thanks,
Pradipta Kumar.
 
 Currently, RDMA device contexts are handed to the user only after they
 create an rdma_cm_id and bind it to a local device.  By exposing the device
 list to the user, it makes it easier for the user to allocate device
 specific resources (such as PDs, CQs, etc.) that are shared among multiple
 rdma_cm_id's.
 
 Signed-off-by: Sean Hefty [EMAIL PROTECTED]
 ---
 Index: include/rdma/rdma_cma.h
 ===
 --- include/rdma/rdma_cma.h   (revision 8215)
 +++ include/rdma/rdma_cma.h   (working copy)
 @@ -332,4 +332,20 @@ static inline uint16_t rdma_get_dst_port
   ((struct sockaddr_in *) id-route.addr.dst_addr)-sin_port;
  }
  
 +/**
 + * rdma_get_devices - Get list of RDMA devices currently available.
 + * @num_devices: If non-NULL, set to the number of devices returned.
 + *
 + * Return a NULL-terminated array of opened RDMA devices.  Callers can use 
 this
 + * routine to allocate resources on specific RDMA devices that will be shared
 + * across multiple rdma_cm_id's.
 + * The array must be released by calling rdma_free_devices().
 + */
 +struct ibv_context **rdma_get_devices(int *num_devices);
 +
 +/**
 + * rdma_free_devices - Frees the list of devices returned by 
 rdma_get_devices().
 + */
 +void rdma_free_devices(struct ibv_context **list);
 +
  #endif /* RDMA_CMA_H */
 Index: src/cma.c
 ===
 --- src/cma.c (revision 8517)
 +++ src/cma.c (working copy)
 @@ -216,6 +216,32 @@ err:
   return ret;
  }
  
 +struct ibv_context **rdma_get_devices(int *num_devices)
 +{
 + struct ibv_context **devs = NULL;
 + int i;
 +
 + if (!cma_dev_cnt  ucma_init())
 + goto out;
 +
 + devs = malloc(sizeof *devs * (cma_dev_cnt + 1));
 + if (!devs)
 + goto out;
 +
 + for (i = 0; i  cma_dev_cnt; i++)
 + devs[i] = cma_dev_array[i].verbs;
 + devs[i] = NULL;
 +out:
 + if (num_devices)
 + *num_devices = devs ? cma_dev_cnt : 0;
 + return devs;
 +}
 +
 +void rdma_free_devices(struct ibv_context **list)
 +{
 + free(list);
 +}
 +
  static void __attribute__((destructor)) rdma_cma_fini(void)
  {
   ucma_cleanup();
 Index: src/librdmacm.map
 ===
 --- src/librdmacm.map (revision 8215)
 +++ src/librdmacm.map (working copy)
 @@ -21,5 +21,7 @@ RDMACM_1.0 {
   rdma_get_dst_attr;
   rdma_join_multicast;
   rdma_leave_multicast;
 + rdma_get_devices;
 + rdma_free_devices;
   local: *;
  };
 
 
 ___
 openib-general mailing list
 openib-general@openib.org
 http://openib.org/mailman/listinfo/openib-general
 
 To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
 
 


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] wiki update - howto for Chelsio's T3 RNIC

2006-07-14 Thread Steve Wise
  
 
 Steve,
 
 The two steps below fail:
 
 # (cd librdmacm && ./autogen.sh && ./configure && make && make install)
 # (cd libcxgb3 && ./autogen.sh && ./configure && make && make install)
 
 
 with:
 checking for ibv_get_device_list in -libverbs... no
 configure: error: ibv_get_device_list() not found.  librdmacm requires 
 libibverbs.
 checking for ibv_get_device_list in -libverbs... no
 configure: error: ibv_get_device_list() not found.  libmthca requires 
 libibverbs.
 
 Should the configure script do -libibverbs instead of -libverbs ?
 

no.  The problem is your library search path doesn't
include /usr/local/lib.  Or, you didn't do an ldconfig after building
libibverbs.   Make sure /usr/local/lib is specified in /etc/ld.so.conf
or in a file in /etc/ld.so.conf.d.  Then run ldconfig.  You can verify
that it's working by running ldconfig -v and seeing libibverbs in the
path...




___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] wiki update - howto for Chelsio's T3 RNIC

2006-07-14 Thread Steve Wise

Erf.  I see the bug.  SCE (stewpid coding error :).  The patch below
will solve the crash, but the issue really is that we're failing to
allocate our data structures.  To work around this for now, I suggest
you patch the cxgb3 src code to reduce the number of supported objects
in core/cxio_hal.h.  Namely T3_MAX_NUM_RI, T3_MAX_NUM_QP, etc... 



For the crash try this:


Index: iwch.c
===
--- iwch.c  (revision 8481)
+++ iwch.c  (working copy)
@@ -65,7 +65,8 @@
 static inline void *vzmalloc(int size)
 {
void *p = vmalloc(size);
-   memset(p, 0, size);
+   if (p)
+   memset(p, 0, size);
return p;
 }



Steve.



 [EMAIL PROTECTED] ~]# modprobe iw_cxgb3
 Segmentation fault
 
 [EMAIL PROTECTED] ~]# dmesg
 ...
 snip
 ...
 eth2: Chelsio T320 2x1BaseX RNIC (rev 0) PCI-X 133MHz/64-bit MSI-X
 eth2: 128MB CM, 256MB PMTX, 256MB PMRX
 eth3: Chelsio T320 2x1BaseX RNIC (rev 0) PCI-X 133MHz/64-bit MSI-X
 Unable to handle kernel paging request at virtual address 00248000
 modprobe[3348]: Oops 8804682956800 [1]
 Modules linked in: iw_cxgb3 cxgb3c ib_umad ib_ucm ib_uverbs ib_sa ib_cm 
 ib_mad ib_core cxgb3
 
 Pid: 3348, CPU 0, comm: modprobe
 psr : 1010081a6018 ifs : 8183 ip  : [a001003148e0]
 Not tainted
 ip is at memset+0x240/0x420
 unat:  pfs : 0593 rsc : 0003
 rnat: 00208000 bsps: 8000 pr  : 05550519
 ldrs:  ccv :  fpsr: 0009804c8a70433f
 csd :  ssd : 
 b0  : a0020022f540 b6  : a002000d44e0 b7  : a002000b9f00
 f6  : 1003e f7  : 1003e6db6db6db6db6db7
 f8  : 1003e071b5ed6 f9  : 1003e
 f10 : 1003e0002 f11 : 1003e
 r1  : a00100c80920 r2  : e040f98034f8 r3  : a00100a99f98
 r8  : 00248000 r9  : 6db6db6db6db6db7 r10 : 071b5ed6
 r11 : 38daf6b0 r12 : e04043897e30 r13 : e0404389
 r14 : e040f98034f8 r15 : 0040fa3e8000 r16 : a00200248000
 r17 : a0007fffc720 r18 : a00100a9c220 r19 : a00100a9c220
 r20 : 0040fffb8000 r21 : 0010 r22 : 0800
 r23 : 0007 r24 : 00248000 r25 : 0040fc02
 r26 : a00202248000 r27 : 00248010 r28 : 00288000
 r29 : e040fc02 r30 :  r31 : 07ff
 
 Call Trace:
   [a00100010b50] show_stack+0x50/0xa0
  sp=e040438979c0 bsp=e040438912a8
   [a00100011420] show_regs+0x820/0x840
  sp=e04043897b90 bsp=e04043891260
   [a00100035990] die+0x1d0/0x2e0
  sp=e04043897b90 bsp=e04043891218
   [a00100754de0] ia64_do_page_fault+0x8e0/0xa00
  sp=e04043897bb0 bsp=e040438911b8
   [a001b880] ia64_leave_kernel+0x0/0x280
  sp=e04043897c60 bsp=e040438911b8
   [a001003148e0] memset+0x240/0x420
  sp=e04043897e30 bsp=e040438911a0
   [a0020022f540] open_rnic_toe+0x140/0x620 [iw_cxgb3]
  sp=e04043897e30 bsp=e04043891148
   [a00200120800] t3c_register_client+0x140/0x1e0 [cxgb3c]
  sp=e04043897e30 bsp=e04043891118
   [a002000884a0] iwch_init_module+0xc0/0x100 [iw_cxgb3]
  sp=e04043897e30 bsp=e04043891100
   [a001000c1eb0] sys_init_module+0x250/0x520
  sp=e04043897e30 bsp=e04043891088
   [a001b6e0] ia64_ret_from_syscall+0x0/0x20
  sp=e04043897e30 bsp=e04043891088
   [a0010640] ia64_ivt+0x00010640/0x400
  sp=e04043898000 bsp=e04043891088
   BUG: modprobe/3348, lock held at task exit time!
   [a00200129440] {t3cdev_db_lock}
 .. held by:  modprobe: 3348 [e0404389, 116]
 ... acquired at:   t3c_register_client+0x30/0x1e0 [cxgb3c]
 
 


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] wiki update - howto for Chelsio's T3 RNIC

2006-07-14 Thread Steve Wise
BTW:  I updated the wiki to mention this issue.



On Fri, 2006-07-14 at 08:43 -0500, Steve Wise wrote:
   
  
  Steve,
  
  The two steps below fail:
  
  # (cd librdmacm  ./autogen.sh  ./configure  make  make install)
  # (cd libcxgb3  ./autogen.sh  ./configure  make  make install)
  
  
  with:
  checking for ibv_get_device_list in -libverbs... no
  configure: error: ibv_get_device_list() not found.  librdmacm requires 
  libibverbs.
  checking for ibv_get_device_list in -libverbs... no
  configure: error: ibv_get_device_list() not found.  libmthca requires 
  libibverbs.
  
  Should the configure script do -libibverbs instead of -libverbs ?
  
 
 no.  The problem is your library search path doesn't
 include /usr/local/lib.  Or, you didn't do an ldconfig after building
 libibverbs.   Make sure /usr/local/lib is specified in /etc/ld.so.conf
 or in a file in /etc/ld.so.conf.d.  Then run ldconfig.  You can verify
 that its working be running ldconfig -v and seeing libibverbs in the
 path...
 
 
 
 
 ___
 openib-general mailing list
 openib-general@openib.org
 http://openib.org/mailman/listinfo/openib-general
 
 To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
 


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



[openib-general] Problem with svn checkout

2006-07-14 Thread Makia Minich
From the archive, I see that back in March someone was having a svn checkout
problem (http://openib.org/pipermail/openib-general/2006-March/019134.html).
Was there ever a solution? Because I'm having an issue doing a fresh
checkout of the trunk:

svn: In directory 'trunk/src/userspace/mpi/mvapich-gen2/www/www1'
svn: Can't copy 
'trunk/src/userspace/mpi/mvapich-gen2/www/www1/.svn/tmp/text-base/mpicc.html
.svn-base' to 
'trunk/src/userspace/mpi/mvapich-gen2/www/www1/mpicc.html.tmp': No such file
or directory

Hopefully I just didn't find the answer and it's a simple solution.

Thanks,
Makia


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] Problem with svn checkout

2006-07-14 Thread Makia Minich
Hmmm... Nevermind, it would appear that this is an issue with my subversion
client on OS X.  I guess I need to figure out what is wrong there.  Sorry
for the disruption.

On 7/14/06 10:13 AM, Makia Minich [EMAIL PROTECTED] wrote:

 From the archive, I see that back in March someone was having a svn checkout
 problem (http://openib.org/pipermail/openib-general/2006-March/019134.html).
 Was there ever a solution? Because I'm having an issue doing a fresh
 checkout of the trunk:
 
 svn: In directory 'trunk/src/userspace/mpi/mvapich-gen2/www/www1'
 svn: Can't copy 
 'trunk/src/userspace/mpi/mvapich-gen2/www/www1/.svn/tmp/text-base/mpicc.html
 .svn-base' to 
 'trunk/src/userspace/mpi/mvapich-gen2/www/www1/mpicc.html.tmp': No such file
 or directory
 
 Hopefully I just didn't find the answer and it's a simple solution.
 
 Thanks,
 Makia
 
 
 ___
 openib-general mailing list
 openib-general@openib.org
 http://openib.org/mailman/listinfo/openib-general
 
 To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
 



___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] [PATCH] [RFC] librdmacm: expose device list to users

2006-07-14 Thread Sean Hefty
Pradipta Kumar Banerjee wrote:
 Thanks Sean for adding this functionality. This was needed.

This was committed to svn 8523.

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Sean Hefty
somenath wrote:
 1. if ib_poll_cq(cq, 1, wc) returns zero, does wc contain a valid entry?

no

  * Poll a CQ for (possibly multiple) completions.  If the return value
  * is < 0, an error occurred.  If the return value is >= 0, it is the
  * number of completions returned.  If the return value is
  * non-negative and strictly less than num_entries, then the CQ was
  * emptied.

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
Sean Hefty wrote:

 somenath wrote:

 1. if ib_poll_cq(cq, 1, wc) returns zero, does wc contain a valid 
 entry?


 no

  * Poll a CQ for (possibly multiple) completions.  If the return value
  * is  0, an error occurred.  If the return value is = 0, it is the
  * number of completions returned.  If the return value is
  * non-negative and strictly less than num_entries, then the CQ was
  * emptied.

 - Sean


Thanks Sean!

As per the above definition, no error occurred since the return value is 0.
And as per your clarification, wc doesn't contain a valid entry.

So  my next question:

2. why is the io completion routine called when ib_poll_cq() returns zero? 
does this kind of notification contain any information?
is there some error happening here? what are some possible problem areas?
any wild guess...?

anyone else seen this kind of stuff?

(I get both the send completion and the recv completion with a 0 return value.)

thanks, som.



___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Sean Hefty
somenath wrote:
 2. why is the io completion routine called when ib_poll_cq() returns 
 zero? does this kind of notification contain any information?
 is there some error happening here? what are some possible problem areas?
 any wild guess...?

Can you clarify what's happening?  Are you calling ib_poll_cq() from your 
ib_comp_handler() and not finding a completion?

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Rimmer, Todd
 From: Sean Hefty
 somenath wrote:
  2. why is the io completion routine called when ib_poll_cq() returns
  zero? does this kind of notification contain any information?
  is there some error happening here? what are some possible problem
 areas?
  any wild guess...?
 
 Can you clarify what's happening?  Are you calling ib_poll_cq() from
your
 ib_comp_handler() and not finding a completion?

Be aware that an IB completion handler should be coded as:

while poll_cq returns a completion
process completion
rearm CQ
while poll_cq returns a completion
process completion

Per the IBTA spec, rearm CQ is not required to generate an event for
completions already on the CQ, just for newly arriving ones (Mellanox
HCAs will generate a completion event if any CQEs remain on the CQ).

Due to various race situations between the HCA and the software/ULP,
there are some valid cases where a completion event could occur after
poll_cq has already processed the completion.

Todd Rimmer

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
Sean Hefty wrote:

 somenath wrote:

 2. why is the io completion routine called when ib_poll_cq() returns 
 zero? does this kind of notification contain any information?
 is there some error happening here? what are some possible problem 
 areas?
 any wild guess...?


 Can you clarify what's happening?  Are you calling ib_poll_cq() from 
 your ib_comp_handler() and not finding a completion?

 - Sean

That's right Sean. I establish a RC connection and pre-post a buffer of 
size 4K in one side,
and try to send 4K packet from the other side. each side gets a 
completion event when ib_poll_cq()
rets zero.

ib_recv_comp_hanlder(cq, arg) {
struct ib_wc wc;

  ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); //i check for errors, there is 
no error;
 
  count = ib_poll_cq(cq, 1, wc);
  // here I find count = 0
}

same happens to send_comp_handler too..

so, everytime I get a cq notification, I get a zero entry (send and recv 
completion
event occurs on respective nodes).

thanks, som.

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Rimmer, Todd
 From: somenath [mailto:[EMAIL PROTECTED]
 Rimmer, Todd wrote:
 
 Be aware that an IB completion handler should be coded as:
 
 while poll_cq returns a completion
  process completion
 rearm CQ
 
 
 
 that's what I am doing, to be more specific:
 
 rearm CQ;
 while (ib_poll_cq(cq, 1, wc)  0) {
  process completion();
 }
 
 is that what you meant?
 
 while poll_cq returns a completion
  process completion
 
 in my case, poll_cq() always returns 0, so I never get a valid wc
entry...
 

I'm not sure why you are never getting a valid wc entry, however by
coding it as you have, on mellanox hardware, you will always get an
extra completion event.

You should code it as:

while (ib_poll_cq(cq, 1, &wc) > 0) {
 process completion();
}
rearm CQ
while (ib_poll_cq(cq, 1, &wc) > 0) {
 process completion();
}

If you do the rearm CQ first, mellanox HCAs will find there are CQEs
still on the CQ, and generate another completion event.

Hence you should always poll 1st, then rearm, then poll again to make
sure there were no CQEs arriving racing with the rearm.

Todd Rimmer

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
somenath wrote:

Rimmer, Todd wrote:

  

From: Sean Hefty
somenath wrote:
   

  

2. why is the io completion routine called when ib_poll_cq() returns
zero? does this kind of notification contain any information?
is there some error happening here? what are some possible problem
 



areas?
   

  

any wild guess...?
 



Can you clarify what's happening?  Are you calling ib_poll_cq() from
   

  

your
 



ib_comp_handler() and not finding a completion?
   

  

Be aware that an IB completion handler should be coded as:

while poll_cq returns a completion
  process completion
rearm CQ
 




that's what I am doing, to be more specific:

rearm CQ;
while (ib_poll_cq(cq, 1, wc)  0) {
 process completion();
}

is that what you meant?
  


just to make sure I conveyed the exact thing I meant, if I change
the above code as follows:

while (ib_poll_cq(cq, 1, wc)  0) {
 process completion();
}
rearm CQ;

then I just get notification once, and don't get any further
notifications...so I assume rearm CQ should be done even if
ib_poll_cq() returns zero.

thanks, som.


  

while poll_cq returns a completion
  process completion

Per the IBTA spec, rearm CQ is not required to generate an event for
completions already on the CQ, just for newly arriving ones (Mellanox
HCAs will generate a completion event if any CQEs remain on the CQ).

Due to various race situations between the HCA and the software/ULP,
there are some valid cases where a completion event could occur after
poll_cq has already processed the completion.
 




in my case, poll_cq() always returns 0, so I never get a valid wc entry...

  

Todd Rimmer
 





___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

  



___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Rimmer, Todd

 From: somenath [mailto:[EMAIL PROTECTED]
 
 just to make sure I conveyed the exact thing I meant, if I change
 the above code as follows:
 
 while (ib_poll_cq(cq, 1, wc)  0) {
  process completion();
 }
 rearm CQ;
 
 then I just get notification once, and don't get any futher
 notifications...so I assume rearm CQ should be done even if
 ib_poll_cq() returns zero.

Yes, if you don't rearm you would never get another completion for the
CQ.  However it is not yet clear why you are not finding the initial
completion when your 1st callback invoked poll_cq.

Todd Rimmer

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Sean Hefty
somenath wrote:
 just to make sure I conveyed the exact thing I meant, if I change
 the above code as follows:
 
 while (ib_poll_cq(cq, 1, wc)  0) {
  process completion();
 }
 rearm CQ;
 
 then I just get notification once, and don't get any futher
 notifications...so I assume rearm CQ should be done even if
 ib_poll_cq() returns zero.

Hmm... what HCA cards and svn version are you using?  Mellanox cards generate 
new events if any completions remain on the CQ.  So, if ib_poll_cq() isn't 
finding any completions, then another event should be generated.  If 
ib_poll_cq() is returning a completion, then it sounds like there's a bug with 
ib_poll_cq() returning the wrong number of completions.

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
Sean Hefty wrote:

 somenath wrote:

 just to make sure I conveyed the exact thing I meant, if I change
 the above code as follows:

 while (ib_poll_cq(cq, 1, wc)  0) {
  process completion();
 }
 rearm CQ;

 then I just get notification once, and don't get any futher
 notifications...so I assume rearm CQ should be done even if
 ib_poll_cq() returns zero.


 Hmm... what HCA cards and svn version are you using?  Mellanox cards 
 generate new events if any completions remain on the CQ.  So, if 
 ib_poll_cq() isn't finding any completions, then another event should 
 be generated.  If ib_poll_cq() is returning a completion, then it 
 sounds like there's a bug with ib_poll_cq() returning the wrong number 
 of completions.

 - Sean


hardware config: PCI express machine with PCI-Express cards of Mellanox 
HCA.

Exact same configuration was working with gen1 stack (no change in the 
hardware setup).
you can find rest of the details here:
===
[EMAIL PROTECTED] bin]# ./ibstat
CA 'mthca0'
CA type: MT25208 (MT23108 compat mode)
Number of ports: 2
Firmware version: 4.6.0
Hardware version: a0
Node GUID: 0x0005ad039abc
System image GUID: 0x0005ad000100d050
Port 1:
State: Active
Physical state: LinkUp
Rate: 10
Base lid: 114
LMC: 0
SM lid: 1
Capability mask: 0x00510a68
Port GUID: 0x0005ad039abd
Port 2:
State: Down
Physical state: Polling
Rate: 10
Base lid: 0
LMC: 0
SM lid: 0
Capability mask: 0x00510a68
Port GUID: 0x0005ad039abe
=

I am using the version of gen2 code as distributed in RH version 4 update 3
since I am trying to get a port on this version of the distribution.
(so don't know the exact svn version they pulled from, I just use the RPM's
in their binary distribution).

thanks, som.




___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] [PATCH 28 of 39] IB/ipath - Fixes a bug where our delay for EEPROM no longer works due to compiler reordering

2006-07-14 Thread Ralph Campbell
static void i2c_wait_for_writes(struct ipath_devdata *dd)
{
   +  mb();
  (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
}

  That's a bit weird.  I wouldn't have expected the compiler to muck around
  with a readl().

 I never liked this patch.  The last time it came up there were
 conflicting answers about whether it was a code generation bug or a
 real issue talking to hardware or what.  At the least I think this
 merits a big comment explain what's going on -- and even better would
 be really understanding the bug that's being fixed so that we're
 confident it is indeed a real fix.
 
 - R.

I haven't been able to reproduce the bug with -Os even on the systems
where it used to fail so I guess this is a non-issue.
Please ignore this patch.


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] wiki update - howto for Chelsio's T3 RNIC

2006-07-14 Thread Louis Laborde
Steve Wise wrote:
 Steve,

 The two steps below fail:

 # (cd librdmacm && ./autogen.sh && ./configure && make && make install)
 # (cd libcxgb3 && ./autogen.sh && ./configure && make && make install)


 with:
 checking for ibv_get_device_list in -libverbs... no
 configure: error: ibv_get_device_list() not found.  librdmacm requires 
 libibverbs.
 checking for ibv_get_device_list in -libverbs... no
 configure: error: ibv_get_device_list() not found.  libmthca requires 
 libibverbs.

 Should the configure script do -libibverbs instead of -libverbs ?

 
 no.  The problem is your library search path doesn't
 include /usr/local/lib.  Or, you didn't do an ldconfig after building
 libibverbs.   Make sure /usr/local/lib is specified in /etc/ld.so.conf
 or in a file in /etc/ld.so.conf.d.  Then run ldconfig.  You can verify
 that it's working by running ldconfig -v and seeing libibverbs in the
 path...
 
 
 

Steve,

I found my problem: I am using RHEL-U3 which comes with other version of
the libibverbs library in package libibverbs-devel. This library is installed
in /usr/lib and does not define the function ibv_get_device_list.

I removed the package and dependencies with:
rpm -e udapl udapl-devel libibverbs libibverbs-utils libibverbs-devel

Is there a way to have the old and new openib libs/tools coexist?

Thanks,
Louis

+-+
| Louis LABORDE  e-mail: [EMAIL PROTECTED] |
| HP Cupertino SISL  phone:(408) 447-3649 |
+-+

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Sean Hefty
Can you also post your code, including the completion handler routines and QP 
creation / initialization sections?

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
Sean Hefty wrote:

 Can you also post your code, including the completion handler routines 
 and QP creation / initialization sections?

 - Sean


currently this is what I am trying to test, just to get one notification 
successfully with a proper wc element.
thanks, som.


int
recvio_complete(
struct  ib_cq *cq,
void *passed_arg) {
io_complete(cq, passed_arg);
}

int
sendio_complete(
struct  ib_cq *cq,
void *passed_arg) {
io_complete(cq, passed_arg);
}

int
io_complete(
struct  ib_cq *cq,
void *passed_arg)
{
xxx_connection_t*arg = passed_arg;
xxx_status_tstat = xxx_st_ok;
struct ib_wc wc;
int count = 0;

if (count = ib_poll_cq(cq, 1, wc)  0) {
stat = xxx_st_error;
arg = NULL;
goto error;
}
if(stat = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP)) {
stat = xxx_st_error;
goto error;
}
if (count == 0) {
stat = xxx_st_ok;
goto error;
}


if ((arg != NULL)  (cq != NULL)) {
io_process(conn_handle);
} else {
stat = xxx_st_error;
goto error;
}
error:
return stat;
}

xxx_status_t
xxx_create_qpairs(
  xxx_connection_t *conn_handle)
{
xxx_status_txxx_stat = xxx_st_ok;
int ib_stat = 0;
int entries_send, entries_recv;
struct ib_qp_init_attr qp_attr =  { 0 };
unsignedlongflags;

conn_handle-llp_send_cq =
ib_create_cq(
xxx_openib.xxx_device,
sendio_complete,
xxx_async_cb,
conn_handle,
XXX_SEND_CQ_DEPTH);
if (IS_ERR(conn_handle-llp_send_cq)) {
ib_stat = PTR_ERR(conn_handle-llp_send_cq);
xxx_stat = map_ib_to_xxx_stat(ib_stat);
goto free_out;
}

conn_handle-llp_recv_cq =
ib_create_cq(
xxx_openib.xxx_device,
recvio_complete,
xxx_async_cb,
conn_handle,
XXX_RECV_CQ_DEPTH);
if (IS_ERR(conn_handle-llp_recv_cq)) {
ib_stat = PTR_ERR(conn_handle-llp_recv_cq);
xxx_stat = map_ib_to_xxx_stat(ib_stat);
ib_destroy_cq(conn_handle-llp_send_cq);
goto free_out;
}
if(ib_stat = ib_req_notify_cq(conn_handle-llp_send_cq, 
IB_CQ_NEXT_COMP)) {
xxx_stat = map_ib_to_xxx_stat(ib_stat);
ib_destroy_cq(conn_handle-llp_send_cq);
ib_destroy_cq(conn_handle-llp_recv_cq);
goto free_out;
}
if(ib_stat = ib_req_notify_cq(conn_handle-llp_recv_cq, 
IB_CQ_NEXT_COMP)) {
xxx_stat = map_ib_to_xxx_stat(ib_stat);
ib_destroy_cq(conn_handle-llp_recv_cq);
goto free_out;
}

memset(qp_attr, 0, sizeof qp_attr);
qp_attr.event_handler = xxx_qevent_cb;
qp_attr.qp_context = conn_handle;
qp_attr.send_cq = conn_handle-llp_send_cq;
qp_attr.recv_cq = conn_handle-llp_recv_cq;
 
qp_attr.cap.max_send_wr = XXX_SENDQ_DEPTH;
qp_attr.cap.max_recv_wr = XXX_RECVQ_DEPTH;
qp_attr.cap.max_send_sge = 1;
qp_attr.cap.max_recv_sge = 1;
qp_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
qp_attr.qp_type = IB_QPT_RC;
qp_attr.port_num = HCA_PRM_PORT; /* special QP types only */

conn_handle-llp_qp =
ib_create_qp(
xxx_openib.xxx_global_pd,
qp_attr);
if (IS_ERR(conn_handle-llp_qp)) {
ib_destroy_cq(conn_handle-llp_send_cq);
ib_destroy_cq(conn_handle-llp_recv_cq);
ib_stat = PTR_ERR(conn_handle-llp_qp);
xxx_stat = map_ib_to_xxx_stat(ib_stat);
goto free_out;
}
xxx_modifyqp_init(conn_handle);

 free_out:
  return (xxx_stat);
}   102,1 68%
  1,1   Top


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] wiki update - howto for Chelsio's T3 RNIC

2006-07-14 Thread Louis Laborde
Steve Wise wrote:
 All,
 
 I added a quick how to on the wiki for the Chelsio T3 rnic...
 
 Steve.
 
 
 
 ___
 openib-general mailing list
 openib-general@openib.org
 http://openib.org/mailman/listinfo/openib-general
 
 To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
 

Steve,

I finally got rping working with Chelsio T3 on IA64 platform.

Which tools should I use to do bandwidth and latency measurements?
Is it covered in some FAQ?

Thanks,
Louis

+-+
| Louis LABORDE  e-mail: [EMAIL PROTECTED] |
| HP Cupertino SISL  phone:(408) 447-3649 |
+-+

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Sean Hefty
somenath wrote:
 int
 io_complete(
struct  ib_cq *cq,
void *passed_arg)
 {
xxx_connection_t*arg = passed_arg;
xxx_status_tstat = xxx_st_ok;
struct ib_wc wc;
int count = 0;
 
if (count = ib_poll_cq(cq, 1, wc)  0) {

I think this evaluates ib_poll_cq(..) < 0 before doing the assignment.  Since 
the expression evaluates to false, count is assigned 0.  Can you try modifying 
this to:

if ((count = ib_poll_cq(..)) < 0)

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
Sean Hefty wrote:

 somenath wrote:

 int
 io_complete(
struct  ib_cq *cq,
void *passed_arg)
 {
xxx_connection_t*arg = passed_arg;
xxx_status_tstat = xxx_st_ok;
struct ib_wc wc;
int count = 0;

if (count = ib_poll_cq(cq, 1, wc)  0) {


 I think this evaluates ib_poll_cq(..)  0 before doing the 
 assignment.  Since the expression evaluates to false, count is 
 assigned 0.  Can you try modifying this to:

 if ((count = ib_poll_cq(..))  0)

 - Sean

I added that stuff, but it didn't make a difference...it still returned 
0

thanks anyway for suggesting

looks like, I may be getting a valid entry even if ib_poll_cq() is 
returning zero...
is that possible?

I am going to try it anyway, even if the ib_poll_cq() returns zero, go 
ahead and try to use the wc..





dump of wc entry from recv side, looks like many are valid entries (like 
wr_id, length, op etc..)

wr_id=0x10118163c00 status=0x0 op=0x80
vendor_err=0x246 byte_len=0x2028 imm_data=0xa0367eb8
qp_num=0x404 src_qp=0x68 wc_flags=0x0
pkey_index=0x0 slid=0x0 sl=0x0
dlid_path_bits=0x0 port_num=0xff

som.


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread Sean Hefty
somenath wrote:
 I think this evaluates ib_poll_cq(..)  0 before doing the 
 assignment.  Since the expression evaluates to false, count is 
 assigned 0.  Can you try modifying this to:

 if ((count = ib_poll_cq(..))  0)

 - Sean
 
 
 I added that stuff, but it didn't make a difference...it still returned 
 0

If ib_poll_cq() is truly returning 0, but with a valid wc, then this is a bug. 
(I'm pretty sure that the additional parens are needed in any case, and you'll 
also want to swap your ib_poll_cq and ib_req_notify calls back around.)  I have 
never seen this issue, and I'm sure that we would have heard about it if others 
had.  Nothing obvious jumped out at me when looking at the mthca completion 
code.

Maybe you've tried this, but can you break the count = ib_poll_cq() out from 
the 
if statement, print count and the wc structure immediately before and after the 
call, and post the results?

 looks like, I may be getting a valid entry even if ib_poll_cq() is 
 returning zero...
 is that possible?

You should not be getting a valid wc if the call returns 0.  For most 
applications this would result in a lost completion.

 dump of wc entry from recv side, looks like many are valid entries (like 
 wr_id, length, op etc..)

This could just be left over data from the stack, but...

 wr_id=0x10118163c00 status=0x0 op=0x80

Does the wr_id match the value that you set on the work request?  (I'm assuming 
that it does based on your previous comment.)  The opcode does match IB_WC_RECV.

- Sean

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] is wc valid if ib_poll_cq() returns zero

2006-07-14 Thread somenath
Sean Hefty wrote:

 somenath wrote:

 I think this evaluates ib_poll_cq(..)  0 before doing the 
 assignment.  Since the expression evaluates to false, count is 
 assigned 0.  Can you try modifying this to:

 if ((count = ib_poll_cq(..))  0)

 - Sean



 I added that stuff, but it didn't make a difference...it still 
 returned 0


 If ib_poll_cq() is truly returning 0, but with a valid wc, then this 
 is a bug. (I'm pretty sure that the additional parens are needed in 
 any case, and you'll also want to swap your ib_poll_cq and 
 ib_req_notify calls back around.)  I have never seen this issue, and 
 I'm sure that we would have heard about it if others had.  Nothing 
 obvious jumped out at me when looking at the mthca completion code.

 Maybe you've tried this, but can you break the count = ib_poll_cq() 
 out from the if statement, print count and the wc structure 
 immediately before and after the call, and post the results?


I tried this code too...(and retrying just now) and seeing it still 
returns 0...

 count = ib_poll_cq(cq, 1, wc);
if (count   0) {
  stat = xxx_st_error;
  goto error;   //it has never gone to error 
from this location.
}

I am ready to try any other suggestions and will change this portion of 
the code too,
but currently I am just trying to get a valid wc handle with 
ib_poll_cq() returning 1!!!
that never happens in anyway I try.


 looks like, I may be getting a valid entry even if ib_poll_cq() is 
 returning zero...
 is that possible?


 You should not be getting a valid wc if the call returns 0.  For most 
 applications this would result in a lost completion.

 dump of wc entry from recv side, looks like many are valid entries 
 (like wr_id, length, op etc..)


 This could just be left over data from the stack, but...

 wr_id=0x10118163c00 status=0x0 op=0x80


 Does the wr_id match the value that you set on the work request?  (I'm 
 assuming that it does based on your previous comment.)  The opcode 
 does match IB_WC_RECV.

'
yes, wr_id, length, opcode, status match...

next, i will check the data 

 - Sean



___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] Suggestions for how to remove bus_to_virt()

2006-07-14 Thread Ralph Campbell
On Thu, 2006-07-13 at 08:46 +0300, Muli Ben-Yehuda wrote:
 On Wed, Jul 12, 2006 at 05:40:13PM -0700, David Miller wrote:
  From: Roland Dreier [EMAIL PROTECTED]
  Date: Wed, 12 Jul 2006 17:11:26 -0700
  
   A cleaner solution would be to make the dma_ API really use the device
   it's passed anyway, and allow drivers to override the standard PCI
   stuff nicely.  But that would be major surgery, I guess.
  
  Clean but expensive, you should not force the rest of the kernel
  to eat the cost of something you want to do when it's totally
  unnecessary for most other users.
  
  For example, x86 never needs to do anything other than a direct
  virt_to_phys translation to produce a DMA address, no matter what
  bus the device is on.  It's a single simple integer adjustment
  that can be done inline in about 2 or 3 instructions at most.
 
 It's possible that even x86 will support multiple IOMMUs in the future
 - for example, the Calgary IOMMU support we recently added to x86-64
 could be modified to work on plain x86 as well.
 
 I like the idea of a per-device DMA-API implementation, but only if it
 can be done in a way that is zero cost to the majority of the users of
 the API. We already have dynamic dma_ops on x86-64 to support nommu,
 swiotlb, gart and Calgary cleanly, extending it to use a per-device
 dma-ops isn't too difficult.
 
 Cheers,
 Muli

A per-device DMA-API would solve my problem.
It would be a fairly invasive changeset though.
The basic idea would be to add a struct dma_mapping_ops *
to struct device and change all the inline dma_* routines
to something like:

static inline dma_addr_t
dma_map_single(struct device *hwdev, void *ptr, size_t size,
   int direction)
{
BUG_ON(!valid_dma_direction(direction));
return hwdev-dma_ops ? 
hwdev-dma_ops-map_single(hwdev, ptr, size, direction) :
dma_ops-map_single(hwdev, ptr, size, direction);
}

Note that the current design only supports one IOMMU type in a system.
This could support multiple IOMMU types at the same time.

Another possibility is to only do this for the infiniband subsystem.
The idea would be to replace calls to dma_* with ib_dma_* which
would be defined as above.


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] Suggestions for how to remove bus_to_virt()

2006-07-14 Thread David Miller
From: Ralph Campbell [EMAIL PROTECTED]
Date: Fri, 14 Jul 2006 15:27:07 -0700

 Note that the current design only supports one IOMMU type in a system.
 This could support multiple IOMMU types at the same time.

This is not true, the framework allows multiple such types
and in fact Sparc64 takes advantage of this.  We have about
4 or 5 different PCI controllers, and the IOMMUs are slightly
different in each.

Even with the standard PCI DMA mapping calls, we can gather the
platform private information necessary to program the IOMMU
appropriately for a given chipset.

The dma_mapping_ops idea will never get accepted by folks like Linus,
for reasons I've outlined in previous emails in this thread.  So, it's
best to look elsewhere for solutions to your problem, such as the
ideas used by the USB and IEEE1394 device layers.

___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] ping problem with ammassocards(iWARPinterface)

2006-07-14 Thread Ravinandan Arakali
As Pradipta suggested, I rebuilt the libraries by removing
the optimization(-O2 flag) from Makefile. Now, I don't see the
core dump but there's no connection established with rping.
This is similar to the failure I am seeing with rdma_lat test.

BTW, when I start the rping in server mode, at say port ,
should I expect to see an entity listening on that port number
when I do netstat -an. Currently, I don't see that.

Ravi

-Original Message-
From: Steve Wise [mailto:[EMAIL PROTECTED]
Sent: Thursday, July 13, 2006 12:10 PM
To: [EMAIL PROTECTED]
Cc: [EMAIL PROTECTED]; openib-general@openib.org
Subject: Re: [openib-general] ping problem with
ammassocards(iWARPinterface)


By the way, does this failure happen immediately or after some period of
time?


On Thu, 2006-07-13 at 13:27 -0500, Steve Wise wrote:
 I guess this isn't surprising since rping doesn't work for you either.
 Something fundamental is screwed up on your user side methinks...

 CM event 8 == RDMA_CM_EVENT_REJECTED which means either the server side
 wasn't listening on the appropriate TCP port, or the server process did
 an rdma_reject().  I'm guessing its the former...

 You could use tcpdump to see if the connection request is getting RST
 by the remote side.




 On Thu, 2006-07-13 at 11:20 -0700, Ravinandan Arakali wrote:
  With the --cma option, I don't see the error about running SM.
  But there's no connection established.
 
  openfab2:/tmp/ib/src/userspace/perftest # ./rdma_lat --cma
  pp_server_connect_cma starting server
 
  openfab:/tmp/ib/src/userspace/perftest # ./rdma_lat --cma 17.2.2.102
  pp_client_connect_cma starting client
  pp_client_connect_cma/856 unexpected CM event 8
  pp_client_connect_cma NOT connected!
  pp_connect_cma(17.2.2.102,18515) failed!
 
  There are no messages in dmesg either.
 
  Ravi
 
  -Original Message-
  From: Steve Wise [mailto:[EMAIL PROTECTED]
  Sent: Thursday, July 13, 2006 6:55 AM
  To: Ravinandan Arakali
  Cc: [EMAIL PROTECTED]; openib-general@openib.org
  Subject: Re: [openib-general] ping problem with ammasso
  cards(iWARPinterface)
 
 
  Are you trying to run this over iwarp?  It doesn't need an SM...
 
  For the perftests rdma_lat and rdma_bw in the iwarp branch, use the
  --cma flag.
 
  Steve.
 
 
  On Wed, 2006-07-12 at 16:39 -0700, Ravinandan Arakali wrote:
   Also, I am trying to run some of the iwarp bandwidth/latency tests
   (available under directory perftest).
   The first thing to do here is to run opensm. When I run opensm (with
debug
   level 10), I get the following error. Any idea what needs to be done
to
  get
   this working ?
  
   openfab2:/tmp/ib/src/userspace # opensm  -d 10
   -
   OpenSM Rev:openib-1.2.0
   Command Line Arguments:
d level = 0xa
Log File: /var/log/osm.log
   -
   OpenSM Rev:openib-1.2.0
  
   Using default GUID 0x0
   Error: Could not get port guid
   Exiting SM
  
   openfab2:/tmp/ib/src/userspace # cat /var/log/osm.log
   Jul 12 08:35:04 718914 [B7E518C0] - OpenSM Rev:openib-1.2.0
   Jul 12 08:35:04 719111 [] - OpenSM Rev:openib-1.2.0
  
   Jul 12 08:35:04 721381 [B7E518C0] - osm_sa_mad_ctrl_unbind: ERR 1A11:
No
   previous bind
   Jul 12 08:35:04 721702 [] - Exiting SM
  
  
  
  
  
   -Original Message-
   From: Pradipta Kumar Banerjee [mailto:[EMAIL PROTECTED]
   Sent: Wednesday, July 12, 2006 10:31 AM
   To: Ravinandan Arakali
   Cc: openib-general@openib.org
   Subject: Re: [openib-general] ping problem with ammasso cards(iWARP
   interface)
  
  
   Ravinandan,
 Do you still see the rping crash?
  
   Thanks,
   Pradipta Kumar.
  
   Ravinandan Arakali wrote:
Pradipta,
Okay, thanks.. Initially, I was not sure since I don't remember
non-zero
values in /proc/krping. When I re-ran the krping test, I see
following
output
openfab2:~ # cat /proc/krping
1-amso0 891376 55711 891376 55711 1782720 27855 1782784 27856
   
As you mentioned, the RDMA traffic seems to be flowing indeed !
Any idea why rping is dumping core ?
   
Has any testing been done using SDP with ammasso cards ?
   
Regards,
Ravi
   
   
-Original Message-
From: Pradipta Kumar Banerjee [mailto:[EMAIL PROTECTED]
Sent: Friday, July 07, 2006 11:20 PM
To: Ravinandan Arakali
Cc: Leonid. Grossman (E-mail); [EMAIL PROTECTED];
openib-general@openib.org
Subject: Re: [openib-general] ping problem with ammasso cards(iWARP
interface)
   
   
Ravinandan Arakali wrote:
Pradipta,
Following is the output from gdb after core dump. I have also
  copy-pasted
the gdb output on client system.
   
Attached is the dmesg output when krping test is run in verbose
mode.
The ping data on the sender(client) seems okay. The content is
shifted
forward by one character for each packet. On receiver, after
receiving
ping
pkt 9, it seems to jump to pkt no. 

Re: [openib-general] Suggestions for how to remove bus_to_virt()

2006-07-14 Thread Ralph Campbell
On Fri, 2006-07-14 at 15:35 -0700, David Miller wrote:
 From: Ralph Campbell [EMAIL PROTECTED]
 Date: Fri, 14 Jul 2006 15:27:07 -0700
 
  Note that the current design only supports one IOMMU type in a system.
  This could support multiple IOMMU types at the same time.
 
 This is not true, the framework allows multiply such types
 and in fact Sparc64 takes advantage of this.  We have about
 4 or 5 different PCI controllers, and the IOMMUs are slightly
 different in each.

I see. It looks like dma_map_single() is an inline call to
pci_map_single() which is a function call that can then
look at the device and tell what IOMMU function to call.

 Even with the standard PCI DMA mapping calls, we can gather the
 platform private information necessary to program the IOMMU
 appropriately for a given chipset.
 
 The dma_mapping_ops idea will never get accepted by folks like Linus,
 for reasons I've outlined in previous emails in this thread.  So, it's
 best to look elsewhere for solutions to your problem, such as the
 ideas used by the USB and IEE1394 device layers.

The USB code won't work in my case because the USB system is
the one doing the memory allocation and IOMMU setup so it
can remember the kernel virtual address or physical pages used
to create the mapping.

In my case, the infiniband (SRP) code is doing the mapping and
only passing the dma_addr_t to the device driver at which point
I have no way to convert it back to a kernel virtual address.
I need to either change the IB device API to include mapping functions
or intercept the dma_* functions so I can save the inputs.


___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general



Re: [openib-general] ping problem with ammassocards(iWARPinterface)

2006-07-14 Thread Steve Wise

- Original Message - 
From: Ravinandan Arakali [EMAIL PROTECTED]
To: 'Steve Wise' [EMAIL PROTECTED]
Cc: [EMAIL PROTECTED]; openib-general@openib.org; Leonid. Grossman 
(E-mail) [EMAIL PROTECTED]
Sent: Friday, July 14, 2006 5:36 PM
Subject: RE: [openib-general] ping problem with 
ammassocards(iWARPinterface)


 As Pradipta suggested, I rebuilt the libraries by removing
 the optimization(-O2 flag) from Makefile. Now, I don't see the
 core dump but there's no connection established with rping.
 This is similar to the failure I am seeing with rdma_lat test.

 BTW, when I start the rping in server mode, at say port ,
 should I expect to see an entity listening on that port number
 when I do netstat -an. Currently, I don't see that.


No, netstat doesn't show rdma information




___
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general