Re: [ewg] [PATCH] OFED scripts: Get rid of ifconfig
On Tue, 22 Feb 2011 10:37:02 +0200 Vladimir Sokolovsky v...@dev.mellanox.co.il wrote: On 02/21/2011 06:18 PM, sebastien dugue wrote: As ifconfig is obsolete and cannot cope with IB link layer addresses, replace all instances of ifconfig with the corresponding(s) ip commands in ofed_scripts. Signed-off-by: Sebastien Duguesebastien.du...@bull.net --- Hi Sebastien, ipcalc is not installed on SLES10/SLES11. So, this patch cannot be applied on openibd. Ah OK, I just checked, it does not exists under debian as well. I'll try to think of something else. Thanks, Sébastien. Regards, Vladimir ofed_scripts/ibdev2netdev |2 +- ofed_scripts/openibd | 10 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ofed_scripts/ibdev2netdev b/ofed_scripts/ibdev2netdev index 7cffa7e..9faa4e3 100755 --- a/ofed_scripts/ibdev2netdev +++ b/ofed_scripts/ibdev2netdev @@ -130,7 +130,7 @@ function find_mac() done } -ifcs=$(ifconfig -a | egrep '^eth|^ib' | gawk '{print $1}') +ifcs=$(/sbin/ip -o link | awk -F : '{print $2}' | egrep '^eth|^ib') for ifc in $ifcs; do len=$(cat /sys/class/net/$ifc/addr_len) diff --git a/ofed_scripts/openibd b/ofed_scripts/openibd index 99b8131..1cd7419 100644 --- a/ofed_scripts/openibd +++ b/ofed_scripts/openibd @@ -762,7 +762,9 @@ bring_up() else . ${NETWORK_CONF_DIR}/ifcfg-${i} if [ ! -z ${IPADDR} ] [ ! -z ${NETMASK} ] [ ! -z ${BROADCAST} ]; then -/sbin/ifconfig ${i} ${IPADDR} netmask ${NETMASK} broadcast ${BROADCAST} /dev/null 21 + eval $(/bin/ipcalc --prefix ${IPADDR} ${NETMASK}) + /sbin/ip addr add ${IPADDR}/${PREFIX} broadcast ${BROADCAST} dev ${i} /dev/null 21 + /sbin/ip link set ${i} up /dev/null 21 else /sbin/ifup ${i} fi @@ -770,12 +772,14 @@ bring_up() ;; SuSE) if [ $KPREFIX == 26 ]; then -ifconfig ${i} up /dev/null 21 + /sbin/ip link set ${i} up /dev/null 21 fi # Workaround for ifup issue: two devices with the same IP address . ${NETWORK_CONF_DIR}/ifcfg-${i} if [ ! -z ${IPADDR} ] [ ! -z ${NETMASK} ] [ ! 
-z ${BROADCAST} ]; then -/sbin/ifconfig ${i} ${IPADDR} netmask ${NETMASK} broadcast ${BROADCAST} /dev/null 21 + eval $(/bin/ipcalc --prefix ${IPADDR} ${NETMASK}) + /sbin/ip addr add ${IPADDR}/${PREFIX} broadcast ${BROADCAST} dev ${i} /dev/null 21 + /sbin/ip link set ${i} up /dev/null 21 else /sbin/ifup ${i} fi ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] [PATCH] OFED scripts: Get rid of ifconfig
As ifconfig is obsolete and cannot cope with IB link layer addresses, replace all instances of ifconfig with the corresponding(s) ip commands in ofed_scripts. Signed-off-by: Sebastien Dugue sebastien.du...@bull.net --- ofed_scripts/ibdev2netdev |2 +- ofed_scripts/openibd | 10 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ofed_scripts/ibdev2netdev b/ofed_scripts/ibdev2netdev index 7cffa7e..9faa4e3 100755 --- a/ofed_scripts/ibdev2netdev +++ b/ofed_scripts/ibdev2netdev @@ -130,7 +130,7 @@ function find_mac() done } -ifcs=$(ifconfig -a | egrep '^eth|^ib' | gawk '{print $1}') +ifcs=$(/sbin/ip -o link | awk -F : '{print $2}' | egrep '^eth|^ib') for ifc in $ifcs; do len=$(cat /sys/class/net/$ifc/addr_len) diff --git a/ofed_scripts/openibd b/ofed_scripts/openibd index 99b8131..1cd7419 100644 --- a/ofed_scripts/openibd +++ b/ofed_scripts/openibd @@ -762,7 +762,9 @@ bring_up() else . ${NETWORK_CONF_DIR}/ifcfg-${i} if [ ! -z ${IPADDR} ] [ ! -z ${NETMASK} ] [ ! -z ${BROADCAST} ]; then -/sbin/ifconfig ${i} ${IPADDR} netmask ${NETMASK} broadcast ${BROADCAST} /dev/null 21 + eval $(/bin/ipcalc --prefix ${IPADDR} ${NETMASK}) + /sbin/ip addr add ${IPADDR}/${PREFIX} broadcast ${BROADCAST} dev ${i} /dev/null 21 + /sbin/ip link set ${i} up /dev/null 21 else /sbin/ifup ${i} fi @@ -770,12 +772,14 @@ bring_up() ;; SuSE) if [ $KPREFIX == 26 ]; then -ifconfig ${i} up /dev/null 21 + /sbin/ip link set ${i} up /dev/null 21 fi # Workaround for ifup issue: two devices with the same IP address . ${NETWORK_CONF_DIR}/ifcfg-${i} if [ ! -z ${IPADDR} ] [ ! -z ${NETMASK} ] [ ! 
-z ${BROADCAST} ]; then -/sbin/ifconfig ${i} ${IPADDR} netmask ${NETMASK} broadcast ${BROADCAST} /dev/null 21 + eval $(/bin/ipcalc --prefix ${IPADDR} ${NETMASK}) + /sbin/ip addr add ${IPADDR}/${PREFIX} broadcast ${BROADCAST} dev ${i} /dev/null 21 + /sbin/ip link set ${i} up /dev/null 21 else /sbin/ifup ${i} fi -- 1.7.1 ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] [PATCH] mlx4_ib XRC RCV: Fix mlx4_ib_reg_xrc_rcv_qp() locking
On Wed, 16 Feb 2011 14:50:02 +0200 Jack Morgenstein ja...@dev.mellanox.co.il wrote: You are correct! Good catch. We will add this to OFED. Thanks, (P.S., I would rather leave irqsave -- it is used everywhere else for this spinlock). Right, but everywhere you know for sure you're in which context you are (process or interrupt), there's no need to use the save/restore variant. Those are just to be used in places where you don't know in which context you are. Also, one thing I noticed in that same function: why allocate ctx_entry before knowing if it's going to be of any use? The allocation could be done right before the first use. Sébastien. -Jack On Monday 14 February 2011 09:32, sebastien dugue wrote: Resending to the proper ML (sorry). In mlx4_ib_reg_xrc_rcv_qp(), we need to take the xrc_reg_list_lock spinlock when walking the xrc_reg_list. We've been hit by this on 2 customer sites. Also, I guess spin_lock_irqsave() could be replaced by spin_lock_irq() in that function as we know for sure we're in process context. Signed-off-by: Sébastien Dugué sebastien.du...@bull.net -- qp.c |3 +++ 1 file changed, 3 insertions(+) dIndex: kernel-ib/drivers/infiniband/hw/mlx4/qp.c === --- kernel-ib.orig/drivers/infiniband/hw/mlx4/qp.c 2011-01-31 16:52:11.0 +0100 +++ kernel-ib/drivers/infiniband/hw/mlx4/qp.c 2011-02-11 15:24:27.0 +0100 @@ -2549,13 +2549,16 @@ } mutex_lock(mibqp-mutex); + spin_lock_irqsave(mibqp-xrc_reg_list_lock, flags); list_for_each_entry(tmp, mibqp-xrc_reg_list, list) if (tmp-context == context) { + spin_unlock_irqrestore(mibqp-xrc_reg_list_lock, flags); mutex_unlock(mibqp-mutex); kfree(ctx_entry); mutex_unlock(to_mdev(xrcd-device)-xrc_reg_mutex); return 0; } + spin_unlock_irqrestore(mibqp-xrc_reg_list_lock, flags); ctx_entry-context = context; spin_lock_irqsave(mibqp-xrc_reg_list_lock, flags); ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] [PATCH] mlx4_ib XRC RCV: Fix mlx4_ib_reg_xrc_rcv_qp() locking
Resending to the proper ML (sorry). In mlx4_ib_reg_xrc_rcv_qp(), we need to take the xrc_reg_list_lock spinlock when walking the xrc_reg_list. We've been hit by this on 2 customer sites. Also, I guess spin_lock_irqsave() could be replaced by spin_lock_irq() in that function as we know for sure we're in process context. Signed-off-by: Sébastien Dugué sebastien.du...@bull.net -- qp.c |3 +++ 1 file changed, 3 insertions(+) dIndex: kernel-ib/drivers/infiniband/hw/mlx4/qp.c === --- kernel-ib.orig/drivers/infiniband/hw/mlx4/qp.c 2011-01-31 16:52:11.0 +0100 +++ kernel-ib/drivers/infiniband/hw/mlx4/qp.c 2011-02-11 15:24:27.0 +0100 @@ -2549,13 +2549,16 @@ } mutex_lock(mibqp-mutex); + spin_lock_irqsave(mibqp-xrc_reg_list_lock, flags); list_for_each_entry(tmp, mibqp-xrc_reg_list, list) if (tmp-context == context) { + spin_unlock_irqrestore(mibqp-xrc_reg_list_lock, flags); mutex_unlock(mibqp-mutex); kfree(ctx_entry); mutex_unlock(to_mdev(xrcd-device)-xrc_reg_mutex); return 0; } + spin_unlock_irqrestore(mibqp-xrc_reg_list_lock, flags); ctx_entry-context = context; spin_lock_irqsave(mibqp-xrc_reg_list_lock, flags); ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] GPUDirect Support
Hi Gilad, On Wed, 2 Feb 2011 13:55:46 -0800 Jagga Soorma jagg...@gmail.com wrote: Hey Gilad, Yes, I would definitely need this. Can you please share the code and how to enable this functionality. I second this. as we're running our own kernel we've been unable so far to use GPUDirect because of the lack of sources. Is this GPUDirect stuff to remain proprietary or will it be opensourced? Regards, Sébastien. Thanks, J On Wed, Feb 2, 2011 at 1:54 PM, Gilad Shainer shai...@mellanox.com wrote: It is not yet in the distro. If you need the code, please send me an email. Gilad -- *From*: ewg-boun...@lists.openfabrics.org *To*: ewg@lists.openfabrics.org *Sent*: Wed Feb 02 13:03:24 2011 *Subject*: [ewg] GPUDirect Support Hey Guys, I am trying to enable GPUDirect support in our environment and I have built OFED release 1.5.2. I was expecting the following directory to be created but don't see this directory and the files within that would help me enable gpudirect: /sys/module/ib_core/parameters The parameter file I am looking for is /sys/module/ib_core/parameters/gpu_direct_enable. Any ideas as to why the parameters directory is missing altogether? Anyone on this list have any experience with this. Here is some information about my environment: OS: Red Hat Enterprise Linux Server release 5.4 (Tikanga) kernel: 2.6.18-164.el5 hca info: -- Image type: ConnectX FW Version: 2.7.9100 Rom Info:type=PXE version=3.2.0 devid=26438 proto=VPI Device ID: 26438 Chip Revision: B0 Description: Node Port1Port2Sys image GUIDs: 78e7d10300218884 78e7d10300218885 78e7d10300218886 78e7d10300218887 MACs: 78e7d1218884 78e7d1218885 Board ID: (HP_020003) VSD: PSID:HP_020003 -- Any help would be appreciated. Thanks, -J ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] IPoIB to Ethernet routing performance
Hi Matthieu, On Thu, 16 Dec 2010 23:20:35 +0100 matthieu hautreux matthieu.hautr...@gmail.com wrote: The router is fitted with one ConnectX2 QDR HCA and one dual port Myricom 10G Ethernet adapter. ... Here are some numbers: - 1 IPoIB stream between client and router: 20 Gbits/sec Looks OK. - 2 Ethernet streams between router and server: 19.5 Gbits/sec Looks OK. Actually I am amazed you can get such a speed with IPoIB. Trying with NPtcp on my DDR infiniband I can only obtain about 4.6Gbit/sec at the best packet size (that is 1/4 of the infiniband bandwidth) with this chip embedded in the mainboard: InfiniBand: Mellanox Technologies MT25204 [InfiniHost III Lx HCA]; and dual E5430 xeon (not nehalem). That's with 2.6.37 kernel and vanilla ib_ipoib module. What's wrong with my setup? I always assumed that such a slow speed was due to the lack of offloading capabilities you get with ethernet cards, but maybe I was wrong...? Hi, I made the same kind of experiments as Sebastien and got results similar to those of you Jabe, with about ~4.6Gbit/s. I am using QDR HCA and ipoib in connected mode on the infiniband part of the testbed and 2 * 10Ge ethernet cards in bonding on the ethernet side of the router. To get better results, I had to increase the MTU on the ethernet side from 1500 to 9000. Indeed, due to the TCP Path MTU discovery, during routed exchanges the MTU used on the ipoib link for TCP messages was automatically set to the minimum MTU of 1500. This small but yet very standard MTU value does not seem to be well handled by the ipoib_cm layer. This may be due to the fact that the IB MTU is 2048. Every 1500 bytes packet is padded to 2048 bytes before being sent through the wire, so you're losing roughly 25% bandwidth compared to an IPoIB MTU which is a multiple of 2048. Is this issue already known and/or reported ? It should be really interesting to understand why a small value of MTU is such a problem for ipoib_cm. 
After a quick look at the code, it seems that ipoib packet processing is single threaded and that each ip packet is transmitted/received and processed as a single unit. If that appears to be the bottleneck, do you think that packets aggregation and/or processing parallelization could be feasible in a future ipoib module ? A big part of the ethernet networks are configured with an MTU of 1500 and 10Ge cards currently employ parallelization strategy in their kernel module to cope with this problem. It is clear that a bigger MTU is better but it is not always possible to achieve due to existing equipments and machines. IMHO, that is a real problem for infiniband/ethernet interoperability. Sebastien, concerning your bad performance of 9.3Gbit/s when routing 2 streams from you infiniband client to your ethernet server, what is the mode of your bonding on the ethernet side during the test ? are you using balance-rr or LACP ? I did not use any Ethernet teaming, I only declared 2 aliases on the clients' ib0 and set the routing tables accordingly. Sébastien. I got this kind of results with LACP as only one link is really used during the transmissions and this link depends of the layer 2 informations of the peers involved in the communication (as long as you use the default xmit_hash_policy). HTH Regards, Matthieu Also what application did you use for the benchmark? Thank you ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] [PATCH] ofed_kernel/makefile: re-add the .PHONY target
Hi Vlad, following the patch "ofed_kernel/makefile: Keep all targets that are to be marked .PHONY in a variable, PHONY", the .PHONY target that uses this variable has gone missing. Signed-off-by: Sébastien Dugué sebastien.du...@bull.net --- diff --git a/ofed_scripts/makefile b/ofed_scripts/makefile index 5dc0d94..b10d500 100644 --- a/ofed_scripts/makefile +++ b/ofed_scripts/makefile @@ -8,6 +8,7 @@ PHONY += install_kernel_nfsrdma PHONY += install_kernel_mlx4 all: +.PHONY: $(PHONY) .DELETE_ON_ERROR: ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] IPoIB to Ethernet routing performance
Hi, I know this might be off topic, but somebody may have already run into the same problem before. I'm trying to use a server as a router between an IB fabric and an Ethernet network. The router is fitted with one ConnectX2 QDR HCA and one dual port Myricom 10G Ethernet adapter. I did some bandwidth measurements using iperf with the following setup: +-+ +-+ +-+ | | | | 10G Eth | | | |QDR IB | +---+ | | client +---+ Router | 10G Eth | Server | | | | +---+ | | | | | | | +-+ +-+ +-+ However, the routing performance is far from what I would have expected. Here are some numbers: - 1 IPoIB stream between client and router: 20 Gbits/sec Looks OK. - 2 Ethernet streams between router and server: 19.5 Gbits/sec Looks OK. - routing 1 IPoIB stream to 1 Ethernet stream from client to server: 9.8 Gbits/sec We manage to saturate the Ethernet link, looks good so far. - routing 2 IPoIB streams to 2 Ethernet streams from client to server: 9.3 Gbits/sec Argh, even less that when routing a single stream. I would have expected a bit more than this. Has anybody ever tried to do some routing between an IB fabric and an Ethernet network and achieved some sensible bandwidth figures? Are there some known limitations in what I try to achieve? Thanks, Sébastien. ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] IPoIB to Ethernet routing performance
On Mon, 6 Dec 2010 10:49:58 - Richard Croucher rich...@informatix-sol.com wrote: You may be able to improve by doing some OS tuning. Right, I tried a few things concerning the TCP/IP stack tuning but nothing really came out of it. All this data should stay in kernel mode but there are lots of bottlenecks in the TCP/IP stack that limit scalability. That may be my problem in fact. The IPoIB code has not been optimized for this use case. I don't think IPoIB to be the bottleneck. In this case as I managed to feed 2 IPoIB streams between the client and the router yielding about 40 Gbits/s bandwidth. You don't mention what Server, kernel and OFED distro you are running. Right, sorry. The router is one of our 4 sockets Nehalem-EX box with 2 IOHs which is running an OFED 1.5.2. The best performance is achieved using InfiniBand/Ethernet hardware gateways. Most of these provide virtual Ethernet NICs to InfiniBand hosts, but the Voltaire 4036E does provide a IPoIB to Ethernet gateway capability. This is FPGA based so does provide much higher performance than you will achieve using a standard server solution. That may be a solution indeed. Are there any real world figures out there concerning the 4036E performance? Thanks Richard, Sébastien. -Original Message- From: ewg-boun...@lists.openfabrics.org [mailto:ewg-boun...@lists.openfabrics.org] On Behalf Of sebastien dugue Sent: 06 December 2010 10:25 To: OF EWG Cc: linux-rdma Subject: [ewg] IPoIB to Ethernet routing performance Hi, I know this might be off topic, but somebody may have already run into the same problem before. I'm trying to use a server as a router between an IB fabric and an Ethernet network. The router is fitted with one ConnectX2 QDR HCA and one dual port Myricom 10G Ethernet adapter. 
I did some bandwidth measurements using iperf with the following setup: +-+ +-+ +-+ | | | | 10G Eth | | | |QDR IB | +---+ | | client +---+ Router | 10G Eth | Server | | | | +---+ | | | | | | | +-+ +-+ +-+ However, the routing performance is far from what I would have expected. Here are some numbers: - 1 IPoIB stream between client and router: 20 Gbits/sec Looks OK. - 2 Ethernet streams between router and server: 19.5 Gbits/sec Looks OK. - routing 1 IPoIB stream to 1 Ethernet stream from client to server: 9.8 Gbits/sec We manage to saturate the Ethernet link, looks good so far. - routing 2 IPoIB streams to 2 Ethernet streams from client to server: 9.3 Gbits/sec Argh, even less that when routing a single stream. I would have expected a bit more than this. Has anybody ever tried to do some routing between an IB fabric and an Ethernet network and achieved some sensible bandwidth figures? Are there some known limitations in what I try to achieve? Thanks, Sébastien. ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] IPoIB to Ethernet routing performance
On Mon, 6 Dec 2010 12:08:43 - Richard Croucher rich...@informatix-sol.com wrote: Unfortunately, the 4036E only has two 10G Ethernet ports which will ultimately limit the throughput. I'll need to look into this option. The Mellanox BridgeX looks a better hardware solution with 12x 10Ge ports but when I tested this they could only provide vNIC functionality and would not commit to adding IPoIB gateway on their roadmap. Right, we did some evaluation on it and this was really a show stopper. Thanks, Sébastien. Qlogic also offer the 12400 Gateway. This has 6x 10ge ports. However, like the Mellanox, I understand they only provide host vNIC support. I'll leave it to representatives from Voltaire, Mellanox and Qlogic to update us. Particularly on support for InfiniBand to Ethernet Gateway for RoCEE. This is needed so that RDMA sessions can be run between InfiniBand and RoCEE connected hosts. I don't believe this will work over any of the today's available products. Richard -Original Message- From: sebastien dugue [mailto:sebastien.du...@bull.net] Sent: 06 December 2010 11:40 To: Richard Croucher Cc: 'OF EWG'; 'linux-rdma' Subject: Re: [ewg] IPoIB to Ethernet routing performance On Mon, 6 Dec 2010 10:49:58 - Richard Croucher rich...@informatix-sol.com wrote: You may be able to improve by doing some OS tuning. Right, I tried a few things concerning the TCP/IP stack tuning but nothing really came out of it. All this data should stay in kernel mode but there are lots of bottlenecks in the TCP/IP stack that limit scalability. That may be my problem in fact. The IPoIB code has not been optimized for this use case. I don't think IPoIB to be the bottleneck. In this case as I managed to feed 2 IPoIB streams between the client and the router yielding about 40 Gbits/s bandwidth. You don't mention what Server, kernel and OFED distro you are running. Right, sorry. The router is one of our 4 sockets Nehalem-EX box with 2 IOHs which is running an OFED 1.5.2. 
The best performance is achieved using InfiniBand/Ethernet hardware gateways. Most of these provide virtual Ethernet NICs to InfiniBand hosts, but the Voltaire 4036E does provide a IPoIB to Ethernet gateway capability. This is FPGA based so does provide much higher performance than you will achieve using a standard server solution. That may be a solution indeed. Are there any real world figures out there concerning the 4036E performance? Thanks Richard, Sébastien. -Original Message- From: ewg-boun...@lists.openfabrics.org [mailto:ewg-boun...@lists.openfabrics.org] On Behalf Of sebastien dugue Sent: 06 December 2010 10:25 To: OF EWG Cc: linux-rdma Subject: [ewg] IPoIB to Ethernet routing performance Hi, I know this might be off topic, but somebody may have already run into the same problem before. I'm trying to use a server as a router between an IB fabric and an Ethernet network. The router is fitted with one ConnectX2 QDR HCA and one dual port Myricom 10G Ethernet adapter. I did some bandwidth measurements using iperf with the following setup: +-+ +-+ +-+ | | | | 10G Eth | | | |QDR IB | +---+ | | client +---+ Router | 10G Eth | Server | | | | +---+ | | | | | | | +-+ +-+ +-+ However, the routing performance is far from what I would have expected. Here are some numbers: - 1 IPoIB stream between client and router: 20 Gbits/sec Looks OK. - 2 Ethernet streams between router and server: 19.5 Gbits/sec Looks OK. - routing 1 IPoIB stream to 1 Ethernet stream from client to server: 9.8 Gbits/sec We manage to saturate the Ethernet link, looks good so far. - routing 2 IPoIB streams to 2 Ethernet streams from client to server: 9.3 Gbits/sec Argh, even less that when routing a single stream. I would have expected a bit more than this. Has anybody ever tried to do some routing between an IB fabric and an Ethernet network and achieved some sensible bandwidth figures? Are there some known limitations in what I try to achieve? Thanks, Sébastien. 
___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] IPoIB to Ethernet routing performance
Hi Jabe, On Mon, 06 Dec 2010 21:47:42 +0100 Jabe jabe.chap...@shiftmail.org wrote: The router is fitted with one ConnectX2 QDR HCA and one dual port Myricom 10G Ethernet adapter. ... Here are some numbers: - 1 IPoIB stream between client and router: 20 Gbits/sec Looks OK. - 2 Ethernet streams between router and server: 19.5 Gbits/sec Looks OK. Actually I am amazed you can get such a speed with IPoIB. Trying with NPtcp on my DDR infiniband I can only obtain about 4.6Gbit/sec at the best packet size (that is 1/4 of the infiniband bandwidth) with this chip embedded in the mainboard: InfiniBand: Mellanox Technologies MT25204 [InfiniHost III Lx HCA]; and dual E5430 xeon (not nehalem). That's with 2.6.37 kernel and vanilla ib_ipoib module. What's wrong with my setup? I always assumed that such a slow speed was due to the lack of offloading capabilities you get with ethernet cards, but maybe I was wrong...? Also what application did you use for the benchmark? I'm using iperf. Sébastien. Thank you ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] IPoIB to Ethernet routing performance
Hi Jason, On Mon, 6 Dec 2010 14:27:59 -0700 Jason Gunthorpe jguntho...@obsidianresearch.com wrote: On Mon, Dec 06, 2010 at 09:47:42PM +0100, Jabe wrote: Technologies MT25204 [InfiniHost III Lx HCA]; and dual E5430 xeon (not nehalem). Newer Mellanox cards have most of the offloads you see for ethernet so they get better performance. What kind of offload capabilities are you referring to for IPoIB? Plus Nehalem is just better at TCP in the first place.. Well that depends on which Nehalem we're talking about. I've found that the EX performs more poorly than the EP, though I didn't dig enough to find out why. Sébastien. ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] What happened to the qperf and srptools git trees?
Hi, are qperf and srptools still maintained? If so, it looks like those git trees never made it to the new openfabrics server. Sébastien. ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
Re: [ewg] Has anyone tried the ISC DHCP patches for Infiniband with ISC DHCP 4.1.x?
On Fri, 14 May 2010 00:31:36 +1000 Justin Clift jus...@salasaga.org wrote: On 05/12/2010 01:25 AM, sebastien dugue wrote: Hi Justin, I've been trying to do just that for the past few days. Right now I managed to have the client working over IB, but it's a gross hack which will need to be cleaned up. Tomorrow I'll check with the server part and will keep you and the list posted. Hey Sebastien, How's this going? :) Hi Justin, I think it's going all right. I finally managed to get a working ISC DHCP over Infiniband prototype. It sustained limited testing overnight, I plan to test on one of our clusters today. I will post the patches soon (those are over the latest Fedora 12 DHCP 4.1.1-13 package). Sebastien. Regards and best wishes, Justin Clift Sebastien. ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] [PATCH 0/2] Add infiniband support to ISC DHCP 4.1.1
Hi Justin, all, here are the 2 patches that I have over Fedora 12 dhcp-4.1.1-13 adding support for DHCP over IB: - the 1st patch adds all the IB support. - the 2nd patch improves on the xid used over IB (and has been contributed by Matthieu Hautreux from CEA). NOTE: For the dhcp-client-identifier needed for IB I chose to have the port GUID prefixed with the pkey of the interface as follows: ff:ff:08:00:38:00:01:37:A8:B1 | | | +-+---+ pkey port GUID This allows to have QoS for DHCP for free. This is just a proof of concept, but has been running satisfactorily. Feel free to comment... Sebastien. ___ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg
[ewg] [PATCH 2/2] Improve XID generation
From: Matthieu Hautreux matthieu.hautr...@cea.fr One major drawback of DHCP over IB is the requirement to use broadcasted replies. The only way for a IB dhclient to find its own reply is by looking for matching xid (transaction ID). xid are generated using random() in dhclient. random() is initialized using srandom(seed+cur_time). Nethertheless, when hw address hlen is 1, the processed seed is roughly always the same. As a result, IB nodes that uses dhclient to configure IB interfaces at the same time share the same xid and as a result use any of the broadcasted replies. The protocol is then broken. The proposed patch build a backup seed using all the interfaces available on the machine and use it if it can not find a good seed for the required interfaces during real DHCP process. If backup seed construction fails (no interfaces providing a hw address long enough for seed construction), it try to build a seed using gethostid(). Signed-off-by: Sebastien Dugue sebastien.du...@bull.net dhclient.c | 64 ++--- 1 file changed, 53 insertions(+), 11 deletions(-) Index: dhcp-4.1.0p1/client/dhclient.c === --- dhcp-4.1.0p1.orig/client/dhclient.c 2010-05-17 16:03:33.0 +0200 +++ dhcp-4.1.0p1/client/dhclient.c 2010-05-17 16:04:08.0 +0200 @@ -906,6 +906,26 @@ } } + /* We create a backup seed before rediscovering interfaces in order to + have a seed built using all of the available interfaces + It's interesting if required interfaces doesn't let us defined + a really unique seed due to a lack of valid HW addr later + (this is the case with DHCP over IB) + We only use the last device as using a sum could broke the + uniqueness of the seed among multiple nodes +*/ + unsigned backup_seed = 0; + for (ip = interfaces; ip; ip = ip - next) { + int junk; + if ( ip - hw_address.hlen = sizeof seed ) + continue; + memcpy (junk, + ip - hw_address.hbuf [ip - hw_address.hlen - + sizeof seed], sizeof seed); + backup_seed = junk; + } + + /* At this point, all the interfaces that the script thinks 
are relevant should be running, so now we once again call discover_interfaces(), and this time ask it to actually set @@ -920,14 +940,36 @@ Not much entropy, but we're booting, so we're not likely to find anything better. */ seed = 0; + int seed_flag = 0; for (ip = interfaces; ip; ip = ip-next) { int junk; + if ( ip - hw_address.hlen = sizeof seed ) + continue; memcpy(junk, ip-hw_address.hbuf[ip-hw_address.hlen - sizeof seed], sizeof seed); seed += junk; + seed_flag = 1; } - srandom(seed + cur_time); + if ( seed_flag == 0 ) { + if ( backup_seed != 0 ) { + seed = backup_seed; + log_info (xid: rand init seed (%u) built using all +available interfaces,seed); + } + else { + seed = cur_time^((unsigned) gethostid()) ; + log_info (xid: warning: no netdev with useable HWADDR found +for seed's uniqueness enforcement); + log_info (xid: rand init seed (%u) built using gethostid, + seed); + } + /* we only use seed and no current time as a broadcast reply */ + /* will certainly be used by the hwaddrless interface */ + srandom(seed); + } + else + srandom(seed + cur_time); /* Setup specific Infiniband options */ for (ip = interfaces; ip; ip = ip-next) { @@ -1423,7 +1465,7 @@ return; } - log_info (DHCPACK from %s, piaddr (packet - client_addr)); + log_info (DHCPACK from %s (xid=%u), piaddr (packet - client_addr), client - xid); lease = packet_to_lease (packet, client); if (!lease) { @@ -2123,7 +2165,7 @@ return; } - log_info (DHCPNAK from %s, piaddr (packet - client_addr)); + log_info (DHCPNAK from %s (xid=%u), piaddr (packet - client_addr), client - xid); if (!client - active) { #if defined (DEBUG) @@ -2249,10 +2291,10 @@ client - packet.secs = htons (65535); client - secs = client - packet.secs; - log_info (DHCPDISCOVER on %s to %s port %d interval %ld, + log_info (DHCPDISCOVER on %s to %s port %d interval %ld (xid=%u), client - name ? client - name
[ewg] [PATCH 1/2] ISC DHCP 4.1.1 IB support
Infiniband support for ISC DHCP This patch adds Infiniband support to DHCP using the Linux Packet Filter interface. Concerning the dhcp-client-identifier, I have chosen to diverge from current IB practice by setting it to the pkey followed by the port GUID: ff:ff:08:00:38:00:01:37:A8:B1 || | ++---+ pkey port GUID This allows having DHCP working over OpenSM's QoS. Signed-off-by: Sebastien Dugue sebastien.du...@bull.net client/dhclient.c | 71 common/bpf.c | 33 +++ common/lpf.c | 239 -- common/socket.c |4 includes/dhcp.h |1 includes/dhcpd.h |3 6 files changed, 308 insertions(+), 43 deletions(-) Index: dhcp-4.1.0p1/common/lpf.c === --- dhcp-4.1.0p1.orig/common/lpf.c 2010-05-17 16:03:17.0 +0200 +++ dhcp-4.1.0p1/common/lpf.c 2010-05-17 16:03:33.0 +0200 @@ -42,6 +42,7 @@ #include includes/netinet/udp.h #include includes/netinet/if_ether.h #include net/if.h +#include ifaddrs.h #ifndef PACKET_AUXDATA #define PACKET_AUXDATA 8 @@ -59,6 +60,15 @@ /* Reinitializes the specified interface after an address change. This is not required for packet-filter APIs. */ +/* Default broadcast address for IPoIB */ +unsigned char default_ib_bcast_addr[20] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0xff, 0xff, 0xff, 0xff +}; + #ifdef USE_LPF_SEND void if_reinitialize_send (info) struct interface_info *info; @@ -86,10 +96,21 @@ struct sockaddr common; } sa; struct ifreq ifr; + int type; + int protocol; /* Make an LPF socket. 
*/ - if ((sock = socket(PF_PACKET, SOCK_RAW, - htons((short)ETH_P_ALL))) 0) { + get_hw_addr(info); + + if (info-hw_address.hbuf[0] == HTYPE_INFINIBAND) { + type = SOCK_DGRAM; + protocol = ETHERTYPE_IP; + } else { + type = SOCK_RAW; + protocol = ETH_P_ALL; + } + + if ((sock = socket(PF_PACKET, type, htons((short)protocol))) 0) { if (errno == ENOPROTOOPT || errno == EPROTONOSUPPORT || errno == ESOCKTNOSUPPORT || errno == EPFNOSUPPORT || errno == EAFNOSUPPORT || errno == EINVAL) { @@ -111,6 +132,7 @@ /* Bind to the interface name */ memset (sa, 0, sizeof sa); sa.ll.sll_family = AF_PACKET; + sa.ll.sll_protocol = htons(protocol); sa.ll.sll_ifindex = ifr.ifr_ifindex; if (bind (sock, sa.common, sizeof sa)) { if (errno == ENOPROTOOPT || errno == EPROTONOSUPPORT || @@ -126,8 +148,6 @@ log_fatal (Bind socket to interface: %m); } - get_hw_addr(info-name, info-hw_address); - return sock; } #endif /* USE_LPF_SEND || USE_LPF_RECEIVE */ @@ -182,6 +202,8 @@ in bpf includes... */ extern struct sock_filter dhcp_bpf_filter []; extern int dhcp_bpf_filter_len; +extern struct sock_filter dhcp_ib_bpf_filter []; +extern int dhcp_ib_bpf_filter_len; #if defined (HAVE_TR_SUPPORT) extern struct sock_filter dhcp_bpf_tr_filter []; @@ -199,11 +221,13 @@ /* Open a LPF device and hang it on this interface... 
*/ info->rfdesc = if_register_lpf (info); - val = 1; - if (setsockopt (info->rfdesc, SOL_PACKET, PACKET_AUXDATA, &val, - sizeof val) < 0) { - if (errno != ENOPROTOOPT) - log_fatal ("Failed to set auxiliary packet data: %m"); + if (info->hw_address.hbuf[0] != HTYPE_INFINIBAND) { + val = 1; + if (setsockopt (info->rfdesc, SOL_PACKET, PACKET_AUXDATA, + &val, sizeof val) < 0) { + if (errno != ENOPROTOOPT) + log_fatal ("Failed to set auxiliary packet data: %m"); + } } #if defined (HAVE_TR_SUPPORT) @@ -249,15 +273,28 @@ memset(&p, 0, sizeof(p)); - /* Set up the bpf filter program structure. This is defined in - bpf.c */ - p.len = dhcp_bpf_filter_len; - p.filter = dhcp_bpf_filter; - -/* Patch the server port into the LPF program... - XXX changes to filter program may require changes - to the insn number(s) used below! XXX */ - dhcp_bpf_filter [8].k = ntohs ((short)local_port); + if (info->hw_address.hbuf[0] == HTYPE_INFINIBAND) { + /* Set up the bpf filter program structure. */ + p.len = dhcp_ib_bpf_filter_len; + p.filter = dhcp_ib_bpf_filter; + + /* Patch the server port into the LPF program... + XXX
[ewg] libibnetdisc: Add grouping for Voltaire's ISR4700 switch
The ISR4700 features 3 kinds of boards: - sLB-4018 line board with a single 36 port asic - sFB-4700 fabric board with a single 36 port asic - sFB-4700X2 double density fabric board with 2 36 port asics The double density fabric board (sFB-4700X2) features external 12X connectors that are only an aggregation of 3 4X ports, therefore ext_portnum is set to match the number printed on the faceplate. Signed-off-by: Sebastien Dugue sebastien.du...@bull.net diff --git a/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h b/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h index 136282c..2735224 100644 --- a/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h +++ b/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h @@ -121,7 +121,7 @@ typedef struct ibnd_chassis { ibnd_node_t *nodes; /* specific to voltaire type nodes */ -#define SPINES_MAX_NUM 12 +#define SPINES_MAX_NUM 18 #define LINES_MAX_NUM 36 ibnd_node_t *spinenode[SPINES_MAX_NUM + 1]; ibnd_node_t *linenode[LINES_MAX_NUM + 1]; diff --git a/infiniband-diags/libibnetdisc/src/chassis.c b/infiniband-diags/libibnetdisc/src/chassis.c index 80e034b..cd2113f 100644 --- a/infiniband-diags/libibnetdisc/src/chassis.c +++ b/infiniband-diags/libibnetdisc/src/chassis.c @@ -49,8 +49,8 @@ #include "internal.h" #include "chassis.h" -static char *ChassisTypeStr[5] = -{ "", "ISR9288", "ISR9096", "ISR2012", "ISR2004" }; +static char *ChassisTypeStr[6] = +{ "", "ISR9288", "ISR9096", "ISR2012", "ISR2004", "ISR4700" }; static char *ChassisSlotTypeStr[4] = { "", "Line", "Spine", "SRBD" }; typedef struct chassis_scan { @@ -71,7 +71,7 @@ char *ibnd_get_chassis_type(ibnd_node_t * node) return NULL; if (!node->chassis) return NULL; - if (node->ch_type == UNRESOLVED_CT || node->ch_type > ISR2004_CT) + if (node->ch_type == UNRESOLVED_CT || node->ch_type > ISR4700_CT) return NULL; return ChassisTypeStr[node->ch_type]; } @@ -273,10 +273,23 @@ static int is_spine_2012(ibnd_node_t * n) return (devid == VTR_DEVID_SFB2012); } +static int 
is_spine_4700(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB4700); +} + +static int is_spine_4700x2(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SFB4700X2); +} + static int is_spine(ibnd_node_t * n) { return (is_spine_9096(n) || is_spine_9288(n) || - is_spine_2004(n) || is_spine_2012(n)); + is_spine_2004(n) || is_spine_2012(n) || + is_spine_4700(n) || is_spine_4700x2(n)); } static int is_line_24(ibnd_node_t * n) @@ -298,9 +311,16 @@ static int is_line_2024(ibnd_node_t * n) return (devid == VTR_DEVID_SLB2024); } +static int is_line_4700(ibnd_node_t * n) +{ + uint32_t devid = mad_get_field(n->info, 0, IB_NODE_DEVID_F); + return (devid == VTR_DEVID_SLB4018); +} + static int is_line(ibnd_node_t * n) { - return (is_line_24(n) || is_line_8(n) || is_line_2024(n)); + return (is_line_24(n) || is_line_8(n) || + is_line_2024(n) || is_line_4700(n)); } int is_chassis_switch(ibnd_node_t * n) @@ -309,52 +329,100 @@ int is_chassis_switch(ibnd_node_t * n) } /* these structs help find Line (Anafa) slot number while using spine portnum */ -char line_slot_2_sfb4[25] = { - 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, - 4 +char line_slot_2_sfb4[37] = { + 0, + 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +char anafa_line_slot_2_sfb4[37] = { + 0, + 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, + 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -char anafa_line_slot_2_sfb4[25] = { - 0, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, - 2 +char line_slot_2_sfb12[37] = { + 0, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, + 10, 10, 11, 11, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +char anafa_line_slot_2_sfb12[37] = { + 0, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0 }; -char line_slot_2_sfb12[25] = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, - 12, 12 +/* LB slot = table[spine port] */ +char line_slot_2_sfb18[37] = { + 0, + 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, + 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18}; +/* LB asic num = table[spine port] */ +char anafa_line_slot_2_sfb18[37] = { + 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; -char anafa_line_slot_2_sfb12