Re: [PATCH 02/25] IB/mthca, net/mlx4: remove counting semaphores

2015-10-29 Thread Jack Morgenstein
On Wed, 28 Oct 2015 03:45:50 +0100
Arnd Bergmann  wrote:

> As far as I can tell, there is a preexisting race condition
> regarding the cmd->use_events flag, which is not protected
> by any lock. When this flag is toggled while another command
> is being started, that command gets stuck until the mode is
> toggled back.

We fixed this issue in mellanox ofed in a manner that allowed keeping
the same counting mechanism.  IMHO, this is preferable, rather than
totally changing the mechanism.

We will submit a similar patch to the upstream kernel shortly.

-Jack

net/mlx4: Switching between sending commands via polling and events may result 
in hung tasks

When switching between these methods of sending commands, it's possible that a
task will keep waiting for the polling semaphore, but may never be able to 
acquire it.
This is due to mlx4_cmd_use_events which "down"s the semaphore back to 0.

Reproducing it involves sending commands while changing between 
mlx4_cmd_use_polling
and mlx4_cmd_use_events.

Solving it by adding a read-write semaphore when switching between modes.

issue: 402565
Change-Id: I19f0d40dbb327c49b39a9abbcb2bb002b0279b0b
Signed-off-by: Matan Barak 
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c  | 23 +--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  2 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index def1338..f94a960 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -766,17 +766,23 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 
*out_param,
return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO);
 
if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) {
+   int ret;
+
if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
return mlx4_internal_err_ret_value(dev, op,
  op_modifier);
+   down_read(&mlx4_priv(dev)->cmd.switch_sem);
if (mlx4_priv(dev)->cmd.use_events)
-   return mlx4_cmd_wait(dev, in_param, out_param,
-out_is_imm, in_modifier,
-op_modifier, op, timeout);
+   ret = mlx4_cmd_wait(dev, in_param, out_param,
+   out_is_imm, in_modifier,
+   op_modifier, op, timeout);
else
-   return mlx4_cmd_poll(dev, in_param, out_param,
-out_is_imm, in_modifier,
-op_modifier, op, timeout);
+   ret = mlx4_cmd_poll(dev, in_param, out_param,
+   out_is_imm, in_modifier,
+   op_modifier, op, timeout);
+
+   up_read(&mlx4_priv(dev)->cmd.switch_sem);
+   return ret;
}
return mlx4_slave_cmd(dev, in_param, out_param, out_is_imm,
  in_modifier, op_modifier, op, timeout);
@@ -2437,6 +2443,7 @@ int mlx4_cmd_init(struct mlx4_dev *dev)
int flags = 0;
 
if (!priv->cmd.initialized) {
+   init_rwsem(&priv->cmd.switch_sem);
mutex_init(&priv->cmd.slave_cmd_mutex);
sema_init(&priv->cmd.poll_sem, 1);
priv->cmd.use_events = 0;
@@ -2566,6 +2573,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev)
if (!priv->cmd.context)
return -ENOMEM;
 
+   down_write(&priv->cmd.switch_sem);
for (i = 0; i < priv->cmd.max_cmds; ++i) {
priv->cmd.context[i].token = i;
priv->cmd.context[i].next  = i + 1;
@@ -2590,6 +2598,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev)
 
down(&priv->cmd.poll_sem);
priv->cmd.use_events = 1;
+   up_write(&priv->cmd.switch_sem);
 
return err;
 }
@@ -2602,6 +2611,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev)
struct mlx4_priv *priv = mlx4_priv(dev);
int i;
 
+   down_write(&priv->cmd.switch_sem);
priv->cmd.use_events = 0;
 
for (i = 0; i < priv->cmd.max_cmds; ++i)
@@ -2610,6 +2620,7 @@ void mlx4_cmd_use_polling(struct mlx4_dev *dev)
kfree(priv->cmd.context);
 
up(&priv->cmd.poll_sem);
+   up_write(&priv->cmd.switch_sem);
 }
 
 struct mlx4_cmd_mailbox *mlx4_alloc_cmd_mailbox(struct mlx4_dev *dev)
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h 
b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 6c58021..2f03e6e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -45,6 +45,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -626,6 +627,7 @@ struct m

Re: [PATCH] infiniband/mlx4: check for mapping error

2015-03-17 Thread Jack Morgenstein
On Mon, 16 Mar 2015 18:49:59 +0100 (CET)
Sebastian Ott  wrote:

> From: Sebastian Ott 
> To: linux-rdma@vger.kernel.org, linux-ker...@vger.kernel.org
> cc: Roland Dreier , Sean Hefty
> , Hal Rosenstock , Or
> Gerlitz , "David S. Miller"
> , Yishai Hadas , Ira Weiny
> , Jack Morgenstein ,
> Matan Barak , Moni Shoua ,
> Jiri Kosina  Subject: [PATCH] infiniband/mlx4: check
> for mapping error Date: Mon, 16 Mar 2015 18:49:59 +0100 (CET)
> User-Agent: Alpine 2.11 (LFD 23 2013-08-11) Organization: "IBM
> Deutschland Research & Development GmbH / Vorsitzende des
> Aufsichtsrats: Martina Koederitz Geschäftsführung: Dirk Wittkopp Sitz
> der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart,
> HRB 243294"
> 
> 
> Since ib_dma_map_single can fail use ib_dma_mapping_error to check
> for errors.
> 
> Signed-off-by: Sebastian Ott 

Acked-by: Jack Morgenstein 

> ---

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4: having trouble getting mlx4_NOP to succeed in the VF driver

2014-12-31 Thread Jack Morgenstein
On Wed, 31 Dec 2014 02:26:07 +0530
Bob Biloxi  wrote:

> Hi,
> 
> I was going through the mlx4 source code and had a few questions
> regarding the generation of interrupts upon execution of the NOP
> command from the VF driver.
> 
> If i am running as a dedicated driver, then NOP seems to work fine(I
> get an interrupt)
> 
> But if I enable SRIOV and then from the VF driver, i run the NOP
> command, I don't receive any interrupt(on the VF side)
> 
> err = mlx4_NOP(dev); //this command when executed from VF driver
> doesn't raise any interrupt.
> 
> I get the following from VF logs:
> 
> [  117.879100] mlx4_core :01:00.0: communication channel command
> 0x5 timed out
> [  117.879120] mlx4_core :01:00.0: failed execution of VHCR_POST
> commandopcode 0x31
> [  117.879127] mlx4_core :01:00.0: NOP command failed to generate
> MSI-X interrupt IRQ 24).
> 

This simply indicates that the VF did not receive a command-completion
interrupt.

> 
> I have checked the logs and it seems from the VHCR, NOP is received
> properly on the PF side and the HCR command is successful.
> 
> Also GEN_EQE HCR command when executed in response to NOP is also
> successful.( i can see the return status of the command execution)
> 
> 
What is your setup topology?  Is the VF running on the Hypervisor?  Is
it running on a VM?

What is your O/S (Ubuntu X.Y, Fedora, SLES, etc).  What kernel are you
running?

I assume that you are running "inbox" under kernel 3.18.1.  Is this
correct?

> 
> But on the VF side, the mlx4_eq_int function doesn't get called.
>
This is because GEN_EQE did not succeed in triggering the EQ which the
VF uses for Async/command-completion events.
 
> I have checked the return value of request_irq and it seems to be
> 0(no error)
> 
> mlx4_enable_msi_x is also successful.
> 
> 
> Can anyone please help me if I am missing something?
> Is there anything to be done so as to get interrupts in the mlx4 VF
> driver?
> 
> Can i check at any logs? dmesg output is the only place i was
> checking.
> 
> 
> 
> Also, can the ConnectX hardware generate interrupt to the VF driver?
ConnectX does generate *send/receive* completion events directly to the
VF. This is because each CQ is associated individually with an EQ, and
the VF associates CQs it creates with its own EQs.

Each VF also creates an Async/command-completion EQ.  However, this EQ
is triggered by the PF via GEN_EQE (see explanation immediately below).

The issue here is that only the PF posts commands to the FW -- and
receives the command-completion event when a command completes.
The VF submits to the PF a command it wishes to post.  The PF posts
the command to the firmware (i.e., the HCA), and fields the
command-completion event.  It then invokes GEN_EQE to trigger the command
completion event on the VF's async EQ.

You need to verify that the IOMMU options are activated in
make menuconfig on the Hypervisor:

--- IOMMU Hardware Support
[*]   AMD IOMMU support
[*] Export AMD IOMMU statistics to debugfs
<> AMD IOMMU Version 2 driver
[*]   Support for Intel IOMMU using DMA Remapping Devices
[*] Enable Intel DMA Remapping Devices by default
[*]   Support for Interrupt Remapping

I suspect that this may not have been done.
also, add intel_iommu=on to the kernel line in /boot/grub/menu.lst

-Jack

> Or is it that it only generates to the PF driver and PF driver uses
> GEN_EQE? I understand that GEN_EQE is used to generate an event
> towards a VF..But how are the interrupts routed to the VF driver?
> 
> 
> I would be really very much grateful if I can get any kind of help.
> 
> 
> Thanks so much !!
> 
> 
> Best Regards,
> Bob
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> in the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Query regarding MAD_DEMUX and Secure Host

2014-12-16 Thread Jack Morgenstein
On Mon, 15 Dec 2014 15:07:58 +0530
Bob Biloxi  wrote:

> am I correct in my understanding
> when i say that MAD_DEMUX feature is not required to be
> supported/implemented in Mellanox RoCE Drivers?
> 
> It is required only for Infiniband drivers?

Actually, you will need to support MAD_DEMUX anyway. If not, the
CONF_SPECIAL_QP command will fail if Secure Host mode is operating.

CONF_SPECIAL_QP is required for RoCE as well, since if it is not called
we will not have QP1.  However, since this command maps QP0 as well to
a QP, the MAD_DEMUX command is still required.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: FMR Support in multi-function environment

2014-11-11 Thread Jack Morgenstein
On Mon, 10 Nov 2014 19:58:46 +0530
Bob Biloxi  wrote:

> Hi,
> 
> Is FMR (Fast Memory Regions) supported in a multi-function mode?

In SRIOV, FMR is supported only for the PF, not for VFs (since this
feature requires writing directly to mapped ICM memory).

You can see this in file drivers/infiniband/hw/mlx4/main.c, function
mlx4_ib_add() :


if (!mlx4_is_slave(ibdev->dev)) {
ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
ibdev->ib_dev.map_phys_fmr  = mlx4_ib_map_phys_fmr;
ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
ibdev->ib_dev.dealloc_fmr   = mlx4_ib_fmr_dealloc;
}

i.e., the fmr functions are not put into the device virtual function
table for slave (= VF) devices.

-Jack

> 
> If yes, I couldn't find the source code for the same in the mlx4
> codebase. Can anyone please point me to the right location...
> 
> What I was trying to understand is this:
> 
> Suppose a VF driver wants to register large amount of memory using
> FMR, will it be able to do so using the mlx4 code.
> 
> Or FMR is supported only in dedicated mode?
> 
> 
> Thanks
> 
> Best Regards,
> Bob
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> in the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4: RoCE support in SRIOV environment

2014-11-10 Thread Jack Morgenstein
On Mon, 10 Nov 2014 20:05:54 +0530
Bob Biloxi  wrote:

> Hi,
> 
> I was going through the mlx4 code and previous mailing lists when I
> came across the following thread:
> 
> 
> http://marc.info/?l=linux-rdma&m=134398354428293&w=2
> 
> 
> In that thread, it is mentioned as follows:
> 
> Some Limitations
> 
> 1. FMRs are not currently supported on slaves. This will be corrected
> in a future submission.
> 2. RoCE is not currently supported on slaves. This will be corrected
> in a future submission.
> 

Hi, RoCE support in SRIOV was added in upstream kernel 3.14 :

commit 39e7d095f9d0a82a78804650917cd57972a480ce
Merge: 36f6fdb aa9a2d5
Author: David S. Miller 
Date:   Wed Mar 12 15:57:26 2014 -0400

Merge branch 'mlx4-next'

Or Gerlitz says:


mlx4: Add SRIOV support for RoCE

This series adds SRIOV support for RoCE (RDMA over Ethernet) to the mlx4 
driver.

The patches are against net-next, as of commit 2d8d40a "pkt_sched: fq:
do not hold qdisc lock while allocating memory"

changes from V1:
 - addressed feedback from Dave on patch #3 and changed 
get_real_sgid_index()
   to be called fill_in_real_sgid_index() and be a void  function.
 - removed some checkpatch warnings on long lines

changes from V0:
  - always check the return code of mlx4_get_roce_gid_from_slave().
The call we fixed is introduced in patch #1 and later removed by
patch #3 that allows guests to have multiple GIDS. The 1..3
separation was done for proper division of patches to logical changes.


Signed-off-by: David S. Miller 


-Jack

> 
> As the thread dates back to 2012, I wanted to confirm if FMR & RoCE are
> still not supported in SRIOV environment(master & slave)?
> 
> 
> Thanks so much in advance!!
> 
> Best Regards,
> Bob
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> in the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/core: Fix race condition in ib_uverbs_open_qp

2014-10-02 Thread Jack Morgenstein
On Tue, 23 Sep 2014 15:55:14 +
"Hefty, Sean"  wrote:

> > --- a/drivers/infiniband/core/uverbs_main.c
> > +++ b/drivers/infiniband/core/uverbs_main.c
> > @@ -502,6 +502,10 @@ void ib_uverbs_qp_event_handler(struct ib_event
> > *event, void *context_ptr)
> >  {
> > struct ib_uevent_object *uobj;
> > 
> > +   /* for XRC target qp's, check that qp is live */
> > +   if (!event->element.qp->uobject
> > || !event->element.qp->uobject->live)
> > +   return;
> 
> Isn't checking 'live' sufficient?

Unfortunately, not -- uobject might be NULL, in which case we will get
a kernel Oops.

However, checking that uobject is NULL alone is not sufficient -- it
might not yet be live.

-Jack


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4: using dma_sync_single_for_cpu/dma_sync_single_for_cpu for writing MTT instead of WRITE_MTT hcr command

2014-07-10 Thread Jack Morgenstein
On Wed, 9 Jul 2014 18:40:46 +0530
Bob Biloxi  wrote:

> Hi,
> 
> I was going through the mr.c file as part of understanding WRITE_MTT
> command in the mlx4 code.
> 
> I could see that instead of issuing the WRITE_MTT HCR command, in case
> of SRIOV, we're directly accessing the ICM space for the MTT Table,
> taking the ownership and updating it. We're doing this using
> dma_sync_single_for_cpu and dma_sync_single_for_cpu.
> 
> I was curious as to why this approach was chosen instead of using the
> HCR command.
> 
> Can anyone please explain the reason/motivation behind this approach?
>
Performance. Direct write to memory is much faster than via HCR

-Jack

> 
> 
> Thanks so much,
> 
> 
> Best Regards,
> Marc
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> in the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4 lockdep splat

2014-07-03 Thread Jack Morgenstein
On Wed, 2 Jul 2014 18:51:17 -0400
Chuck Lever  wrote:

> I built my own according to your description. The fix seems good.
> 

Thanks, Chuck!

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4 lockdep splat

2014-07-02 Thread Jack Morgenstein
On Tue, 1 Jul 2014 18:17:27 -0400
Chuck Lever  wrote:

> Hi-
> 
> This happens during boot, and means that lockdep and lock_stat are
> completely disabled during any testing I do.
> 
> Does this splat have a known fix, or should I file an upstream
> bugzilla?
> 
> 
> Jul  1 15:50:22 manet kernel: =
> Jul  1 15:50:22 manet kernel: [ INFO: inconsistent lock state ]
> Jul  1 15:50:22 manet kernel: 3.16.0-rc2-00024-g2e78883 #17 Tainted:
> GE Jul  1 15:50:22 manet kernel:
> - Jul  1 15:50:22 manet kernel:
> inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. Jul  1 15:50:22
> manet kernel: swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes: Jul  1
> 15:50:22 manet kernel: (&(&iboe->lock)->rlock){+.?...}, at:
> [] mlx4_ib_addr_event+0xdb/0x1a0 [mlx4_ib] Jul  1
> 15:50:22 manet kernel: {SOFTIRQ-ON-W} state was registered at: Jul  1
> 15:50:22 manet kernel:  []
> mark_irqflags+0x110/0x170 Jul  1 15:50:22 manet kernel:
> [] __lock_acquire+0x2c6/0x5b0 Jul  1 15:50:22 manet
> kernel:  [] lock_acquire+0xe9/0x120 Jul  1 15:50:22
> manet kernel:  [] _raw_spin_lock+0x3e/0x80 Jul  1
> 15:50:22 manet kernel:  []
> mlx4_ib_scan_netdevs+0x34/0x260 [mlx4_ib] Jul  1 15:50:22 manet
> kernel:  [] mlx4_ib_netdev_event+0x2b/0x40
> [mlx4_ib] Jul  1 15:50:22 manet kernel:  []
> register_netdevice_notifier+0x99/0x1e0 Jul  1 15:50:22 manet kernel:
> [] mlx4_ib_add+0x743/0xbc0 [mlx4_ib]

Hi Chuck,

There is a known fix which I need to submit.
The fix is to use lock/unlock_bh instead of spin_lock/unlock.

The netdev event notifier callback (mlx4_ib_addr_event), which also
uses the lock, is called from a soft-interrupt context, so turning off
hard interrupts is not needed here.  _bh spinlocks are sufficient.

-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband:for-next 41/47] drivers/net/ethernet/mellanox/mlx4/resource_tracker.c:2875:3: warning: case value '7' not in enumerated type 'enum qp_transition'

2014-06-01 Thread Jack Morgenstein
On Fri, 30 May 2014 15:43:39 -0700
Roland Dreier  wrote:

> Looks like this is actually a bug... assuming you guys agree the patch
> is correct, I'll add the following:
> 
> commit 165cb465f73c (HEAD, mlx4)
> Author: Roland Dreier 
> Date:   Fri May 30 15:38:58 2014
> 
> mlx4_core: Move handling of MLX4_QP_ST_MLX to proper switch
> statement
> 
> The handling of MLX4_QP_ST_MLX in verify_qp_parameters() was
> accidentally put inside the inner switch statement (that handles
> which transition of RC/UC/XRC QPs is happening).  Fix this by moving
> the case to the outer switch statement.
> 
> The compiler pointed this out with:
> 
> drivers/net/ethernet/mellanox/mlx4/resource_tracker.c: In
> function 'verify_qp_parameters':
>  >> drivers/net/ethernet/mellanox/mlx4/resource_tracker.c:2875:3:  
> warning: case value '7' not in enumerated type 'enum qp_transition'
> [-Wswitch]
>case MLX4_QP_ST_MLX:
> 
> Reported-by: kbuild test robot 
> Fixes: 99ec41d0a48c ("mlx4: Add infrastructure for selecting VFs
> to enable QP0 via MLX proxy QPs")
> Signed-off-by: Roland Dreier 
Acked-by: Jack Morgenstein 


Thanks, Roland!  your fix is exactly right!

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4 qp allocation

2014-02-13 Thread Jack Morgenstein
On Thu, 13 Feb 2014 00:18:22 +0530
Bob Biloxi  wrote:

> The VFs need to allocate the memory for Send Queue Buffer, Receive
> Queue Buffer, Completion Queue Buffer, Event Queue Buffer.
> 
> Is that right?

Yes.

> 
> Also, as the QPs, CQs etc are created by the HCA when ALLOC_RES
> command is issued, does the PF driver need to maintain anything to
> associate the QPs, CQs created by the HCA with owners(VFs) possessing
> them?

Of course. These resources must be de-allocated if, for example, the
VM running the VF crashes -- or we have a resource leak.

This also is used for security checking, to make sure that a VF does
not mess around with resources that do not "belong" to it.

-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: mlx4 qp allocation

2014-02-11 Thread Jack Morgenstein
On Wed, 29 Jan 2014 15:52:09 +0530
Bob Biloxi  wrote:

> These paths are taken based on the return value of mlx4_is_func(dev).
> This is true for MASTER or SLAVE which I believe is Physical Function
> Driver/Virtual Function Driver. So for SRIOV, it covers all cases.
> 
> The MAP_ICM portion which gets executed as part of __mlx4_qp_alloc_icm
> never gets called??

For slaves (VFs), the command is sent via the comm channel to the
Hypervisor.  It is the Hypervisor which invokes map_icm on behalf of
that slave.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/25] IB/mlx4: fix error return code

2013-12-30 Thread Jack Morgenstein
On Sun, 29 Dec 2013 23:47:20 +0100
Julia Lawall  wrote:

> diff --git a/drivers/infiniband/hw/mlx4/sysfs.c
> b/drivers/infiniband/hw/mlx4/sysfs.c index 97516eb..db2ea31 100644
> --- a/drivers/infiniband/hw/mlx4/sysfs.c
> +++ b/drivers/infiniband/hw/mlx4/sysfs.c
> @@ -582,8 +582,10 @@ static int add_port(struct mlx4_ib_dev *dev, int
> port_num, int slave) p->pkey_group.attrs =
>   alloc_group_attrs(show_port_pkey, store_port_pkey,
> dev->dev->caps.pkey_table_len[port_num]);
> - if (!p->pkey_group.attrs)
> + if (!p->pkey_group.attrs) {
> + ret = -ENOMEM;
>   goto err_alloc;
> + }
>  
>   ret = sysfs_create_group(&p->kobj, &p->pkey_group);
>   if (ret)
> @@ -591,8 +593,10 @@ static int add_port(struct mlx4_ib_dev *dev, int
> port_num, int slave) 
>   p->gid_group.name  = "gid_idx";
>   p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx,
> NULL, 1);
> - if (!p->gid_group.attrs)
> + if (!p->gid_group.attrs) {
> + ret = -ENOMEM;
>   goto err_free_pkey;
> + }
>  

ACK. Julia's patch is correct -- this is indeed a bug-fix.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch v2 3/3] IB/cache: don't fill the cache with junk

2013-10-21 Thread Jack Morgenstein
On Mon, 21 Oct 2013 00:12:54 -0400
Doug Ledford  wrote:

> I think I like my suggestion better: go back to having a full table,
> but use a bitmap to indicate valid entries and then use the bitmap to
> limit our comparisons in the find_cached* functions, and put the
> get_* funtions back to being O(1).  But I would still do that
> incrementally from here I think.
> 
> But I'm not totally convinced of that either.  The exact sitiation I 
> listed above, lots of GIDs on an SRIOV PF, makes me concerned that we 
> can get back to a horrible situation in the find_cached* functions
> once we actually have lots of valid entries.  It makes me think we
> need something better than just a linear search of all valid entries
> when you take SRIOV into account.  Whether hash chains or ranges or
> something to make the lots of valid GIDs case faster, I suspect
> something needs to be done, but because things simply aren't in
> common use yet we don't know it.

Doug, I like your suggestion regarding bitmaps. I would rather hold off
on patch #3, though, because as you say, patches 1 and 2 do most of the
work and the patch #3 optimization won't do much if the GID table is
very populated (which will be the case under SRIOV).

I think what you say is correct, a linear search through a populated
table will be expensive -- and we need to come up with a better
strategy here.

ACK for first 2 patches, please hold off on the third.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch v2 1/3] IB/cma: use cached gids

2013-10-19 Thread Jack Morgenstein
ACK.  Looks good!
Very nice catch, Doug!

-Jack

On Tue, 24 Sep 2013 17:16:27 -0400
Doug Ledford  wrote:

> The cma_acquire_dev function was changed by commit 3c86aa70bf67
> to use find_gid_port because multiport devices might have
> either IB or IBoE formatted gids.  The old function assumed that
> all ports on the same device used the same GID format.  However,
> when it was changed to use find_gid_port, we inadvertently lost
> usage of the GID cache.  This turned out to be a very costly
> change.  In our testing, each iteration through each index of
> the GID table takes roughly 35us.  When you have multiple
> devices in a system, and the GID you are looking for is on one
> of the later devices, the code loops through all of the GID
> indexes on all of the early devices before it finally succeeds
> on the target device.  This pathological search behavior combined
> with 35us per GID table index retrieval results in results such
> as the following from the cmtime application that's part of the
> latest librdmacm git repo:
> 
> ib1:
> step  total ms max ms min us  us / conn
> create id:   29.42   0.04   1.00   2.94
> bind addr:   186705.66  19.00   18556.00   18670.57
> resolve addr :   41.93   9.68 619.00   4.19
> resolve route:  486.93   0.48 101.00  48.69
> create qp: 4021.95   6.18 330.00 402.20
> connect  :68350.39   68588.17   24632.006835.04
> disconnect   : 1460.43 252.65-1862269.00 146.04
> destroy  :   41.16   0.04   2.00   4.12
> 
> ib0:
> step  total ms max ms min us  us / conn
> create id:   28.61   0.68   1.00   2.86
> bind addr: 2178.86   2.95 201.00 217.89
> resolve addr :   51.26  16.85 845.00   5.13
> resolve route:  620.08   0.43  92.00  62.01
> create qp: 3344.40   6.36 273.00 334.44
> connect  : 6435.996368.537844.00 643.60
> disconnect   : 5095.38 321.90 757.00 509.54
> destroy  :   37.13   0.02   2.00   3.71
> 
> Clearly, both the bind address and connect operations suffer
> a huge penalty for being anything other than the default
> GID on the first port in the system.
> 
> After applying this patch, the numbers now look like this:
> 
> ib1:
> step  total ms max ms min us  us / conn
> create id:   30.15   0.03   1.00   3.01
> bind addr:   80.27   0.04   7.00   8.03
> resolve addr :   43.02  13.53 589.00   4.30
> resolve route:  482.90   0.45 100.00  48.29
> create qp: 3986.55   5.80 330.00 398.66
> connect  : 7141.537051.295005.00 714.15
> disconnect   : 5038.85 193.63 918.00 503.88
> destroy  :   37.02   0.04   2.00   3.70
> 
> ib0:
> step  total ms max ms min us  us / conn
> create id:   34.27   0.05   1.00   3.43
> bind addr:   26.45   0.04   1.00   2.64
> resolve addr :   38.25  10.54 760.00   3.82
> resolve route:  604.79   0.43  97.00  60.48
> create qp: 3314.95   6.34 273.00 331.49
> connect  :12399.26   12351.108609.001239.93
> disconnect   : 5096.76 270.721015.00 509.68
> destroy  :   37.10   0.03   2.00   3.71
> 
> It's worth noting that we still suffer a bit of a penalty on
> connect to the wrong device, but the penalty is much less than
> it used to be.  Follow on patches deal with this penalty.
> 
> Many thanks to Neil Horman for helping to track the source of
> slow function that allowed us to track down the fact that
> the original patch I mentioned above backed out cache usage
> and identify just how much that impacted the system.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch v2 3/3] IB/cache: don't fill the cache with junk

2013-10-19 Thread Jack Morgenstein
On Tue, 24 Sep 2013 17:16:29 -0400
Doug Ledford  wrote:

> @@ -85,13 +91,26 @@ int ib_get_cached_gid(struct ib_device *device,
>  
>   cache = device->cache.gid_cache[port_num -
> start_port(device)]; 
> - if (index < 0 || index >= cache->table_len)
> + if (index < 0 || index >= cache->table_len) {
>   ret = -EINVAL;
> - else
> - *gid = cache->table[index];
> + goto out_unlock;
> + }
>  
> - read_unlock_irqrestore(&device->cache.lock, flags);
> + for (i = 0; i < cache->table_len; ++i)
> + if (cache->entry[i].index == index)
> + break;
> +
> +

Hi Doug,

I am a bit concerned about this patch, because where before
ib_get_cached_gid just returned the GID at the given index, with your
suggested change, ib_get_cached_gid() requires a search of the new gid
table (to find the entry with the requested index value).

ib_get_cached_gid is called by cm_req_handler, for the gid at index 0.
There is no guarantee that this will be the first entry in the new
scheme.

Furthermore, ib_get_cached_gid is also called in MAD packet handling,
with the specific gid index that is required.

Thus, the savings for ib_find_cached_gid might possibly be offset by a
performance loss in ib_get_cached_gid.

A simpler optimization would be to simply keep a count of the number of
valid GIDS in the gid table -- and break off the search when the last
valid GID has been seen.  This would optimize cases where, for example,
you are searching for a GID that is not in the table, and only the
first 3 gids in the table are valid (so you would not needlessly access
125 invalid GIDs).  Clearly, such an optimization is only useful when
there are a lot of invalid gids bunched together at the end of the
table.  Still, something to think about.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Mellanox VPI

2013-10-06 Thread Jack Morgenstein
On Sun, 6 Oct 2013 14:53:18 +0400
Vasiliy Tolstov  wrote:

> 2013/10/6 Or Gerlitz :
> > # lspci | grep Mell
> >
> > 06:00.0 InfiniBand: Mellanox Technologies MT26428 [ConnectX VPI
> > PCIe 2.0 5GT/s - IB QDR / 10GigE] (rev b0)
> > 07:00.0 Network controller: Mellanox Technologies MT27520 Family
> >
> > # echo ib > /sys/bus/pci/devices/\:06\:00.0/mlx4_port1
> > # echo eth > /sys/bus/pci/devices/\:06\:00.0/mlx4_port2
> 
> 
> Thanks, but as i see this car have two ports.
> How about eth and ib on the same port in the same time?
> 
You cannot have ETH and IB link layers existing on the same port at the
same time.

However, you can run IB applications over an ETH link layer by using
RoCE (RDMA over Converged Ethernet). If you do:

ibv_devinfo

on a host where an HCA has a port configured to an ETH link layer,
you should see an entry for the ETH layer port.  Under that port
number, you will see:

...
link_layer: Ethernet

-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 51/77] mthca: Update MSI/MSI-X interrupts enablement code

2013-10-03 Thread Jack Morgenstein
On Wed,  2 Oct 2013 12:49:07 +0200
Alexander Gordeev  wrote:

> Subject: [PATCH RFC 51/77] mthca: Update MSI/MSI-X interrupts
> enablement code Date: Wed,  2 Oct 2013 12:49:07 +0200
> Sender: linux-rdma-ow...@vger.kernel.org
> X-Mailer: git-send-email 1.7.7.6
> 
> As result of recent re-design of the MSI/MSI-X interrupts enabling
> pattern this driver has to be updated to use the new technique to
> obtain a optimal number of MSI/MSI-X interrupts required.
> 
> Signed-off-by: Alexander Gordeev 
> ---

ACK.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 46/77] mlx4: Update MSI/MSI-X interrupts enablement code

2013-10-03 Thread Jack Morgenstein
On Wed,  2 Oct 2013 12:49:02 +0200
Alexander Gordeev  wrote:

> As result of recent re-design of the MSI/MSI-X interrupts enabling
> pattern this driver has to be updated to use the new technique to
> obtain a optimal number of MSI/MSI-X interrupts required.
> 
> Signed-off-by: Alexander Gordeev 

New review -- ACK (i.e., patch is OK), subject to acceptance of patches
05 and 07 of this patch set.

I sent my previous review (NACK) when I was not yet aware that
changes proposed were due to the two earlier patches (mentioned above)
in the current patch set.

The change log here should actually read something like the following:

As a result of changes to the MSI/MSI_X enabling procedures, this driver
must be modified in order to preserve its current msi/msi_x enablement
logic.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 46/77] mlx4: Update MSI/MSI-X interrupts enablement code

2013-10-03 Thread Jack Morgenstein
On Wed,  2 Oct 2013 12:49:02 +0200
Alexander Gordeev  wrote:

UPDATING THIS REPLY.
Your change log confused me. The change below is not from a "recent
re-design", it is required due to an earlier patch in this patch set.
From the log, I assumed that the change you are talking about is already
upstream.

I will re-review.

-Jack

NACK.  This change does not do anything logically as far as I can tell.
pci_enable_msix in the current upstream kernel itself calls
pci_msix_table_size.  The current code yields the same results
as the code suggested below. (i.e., the suggested code has no effect on
optimality).

BTW, pci_msix_table_size never returns a value < 0 (if msix is not
enabled, it returns 0 for the table size), so the (err < 0) check here
is not correct. (I also do not like using "err" here anyway for the
value returned by pci_msix_table_size().  There is no error here, and
it is simply confusing.)

-Jack

> As result of recent re-design of the MSI/MSI-X interrupts enabling
> pattern this driver has to be updated to use the new technique to
> obtain a optimal number of MSI/MSI-X interrupts required.
> 
> Signed-off-by: Alexander Gordeev 
> ---
>  drivers/net/ethernet/mellanox/mlx4/main.c |   17 -
>  1 files changed, 8 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c
> b/drivers/net/ethernet/mellanox/mlx4/main.c index 60c9f4f..377a5ea
> 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/main.c
> @@ -1852,8 +1852,16 @@ static void mlx4_enable_msi_x(struct mlx4_dev
> *dev) int i;
>  
>   if (msi_x) {
> + err = pci_msix_table_size(dev->pdev);
> + if (err < 0)
> + goto no_msi;
> +
> + /* Try if at least 2 vectors are available */
>   nreq = min_t(int, dev->caps.num_eqs -
> dev->caps.reserved_eqs, nreq);
> + nreq = min_t(int, nreq, err);
> + if (nreq < 2)
> + goto no_msi;
>  
>   entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
>   if (!entries)
> @@ -1862,17 +1870,8 @@ static void mlx4_enable_msi_x(struct mlx4_dev
> *dev) for (i = 0; i < nreq; ++i)
>   entries[i].entry = i;
>  
> - retry:
>   err = pci_enable_msix(dev->pdev, entries, nreq);
>   if (err) {
> - /* Try again if at least 2 vectors are
> available */
> - if (err > 1) {
> - mlx4_info(dev, "Requested %d
> vectors, "
> -   "but only %d MSI-X vectors
> available, "
> -   "trying again\n", nreq,
> err);
> - nreq = err;
> - goto retry;
> - }
>   kfree(entries);
>   goto no_msi;
>   }

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 46/77] mlx4: Update MSI/MSI-X interrupts enablement code

2013-10-03 Thread Jack Morgenstein
On Wed,  2 Oct 2013 12:49:02 +0200
Alexander Gordeev  wrote:

NACK.  This change does not do anything logically as far as I can tell.
pci_enable_msix in the current upstream kernel itself calls
pci_msix_table_size.  The current code yields the same results
as the code suggested below. (i.e., the suggested code has no effect on
optimality).

BTW, pci_msix_table_size never returns a value < 0 (if msix is not
enabled, it returns 0 for the table size), so the (err < 0) check here
is not correct. (I also do not like using "err" here anyway for the
value returned by pci_msix_table_size().  There is no error here, and
it is simply confusing.)

-Jack

> As result of recent re-design of the MSI/MSI-X interrupts enabling
> pattern this driver has to be updated to use the new technique to
> obtain a optimal number of MSI/MSI-X interrupts required.
> 
> Signed-off-by: Alexander Gordeev 
> ---
>  drivers/net/ethernet/mellanox/mlx4/main.c |   17 -
>  1 files changed, 8 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c
> b/drivers/net/ethernet/mellanox/mlx4/main.c index 60c9f4f..377a5ea
> 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/main.c
> @@ -1852,8 +1852,16 @@ static void mlx4_enable_msi_x(struct mlx4_dev
> *dev) int i;
>  
>   if (msi_x) {
> + err = pci_msix_table_size(dev->pdev);
> + if (err < 0)
> + goto no_msi;
> +
> + /* Try if at least 2 vectors are available */
>   nreq = min_t(int, dev->caps.num_eqs -
> dev->caps.reserved_eqs, nreq);
> + nreq = min_t(int, nreq, err);
> + if (nreq < 2)
> + goto no_msi;
>  
>   entries = kcalloc(nreq, sizeof *entries, GFP_KERNEL);
>   if (!entries)
> @@ -1862,17 +1870,8 @@ static void mlx4_enable_msi_x(struct mlx4_dev
> *dev) for (i = 0; i < nreq; ++i)
>   entries[i].entry = i;
>  
> - retry:
>   err = pci_enable_msix(dev->pdev, entries, nreq);
>   if (err) {
> - /* Try again if at least 2 vectors are
> available */
> - if (err > 1) {
> - mlx4_info(dev, "Requested %d
> vectors, "
> -   "but only %d MSI-X vectors
> available, "
> -   "trying again\n", nreq,
> err);
> - nreq = err;
> - goto retry;
> - }
>   kfree(entries);
>   goto no_msi;
>   }

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Write combining support in the upstream kernel

2013-09-02 Thread Jack Morgenstein
Hi Roland,

This is a re-posting (and rewording) of a question I sent you on July 6,
2009.

I've been looking at the write-combining support in the kernel,
and it looks good. The caller simply invokes pgprot_writecombine() and
if write combining is available, the region is mapped for it (if wc is
not available, the regions is mapped as non-cached).

However, the API silently activates write combining without providing
any architecture-independent means of knowing whether write combining
is enabled or not. 

For example, in X86 the procedure pgprot_writecombine is as follows:
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
 if (pat_enabled)
 return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
 else
 return pgprot_noncached(prot);
 }

Note that pat_enabled is an architecture-dependent variable!

Silent activation of WC is OK in situations where for feature X, if
write-combining is available, X works better and the driver's
performance improves. (the driver simply calls pgprot_writecombine(),
and if WC is available it is activated for the region; if it is not
available, the region is mapped in the usual fashion).

However, what about situations where we wish to enable feature X ONLY
if write combining is available? (In this case the driver cannot simply
call pgprot_writecombine() not knowing if write-combining is really
used or not).

The required logic here is:
if (write-combining is available)
Activate feature X, and use pgprot_writecombine() for
its regions;
else
Do NOT activate feature X.

In MLNX_OFED, to get around this problem, I introduced some
architecture-dependent wrapper functions to take care of this (where
these functions simply indicate in a fixed manner whether write
combining is enabled for specific architectures):

#include 
#include "wc.h"

#if defined(__i386__) || defined(__x86_64__)

pgprot_t pgprot_wc(pgprot_t _prot)
{
return pgprot_writecombine(_prot);
}

int mlx4_wc_enabled(void)
{
return 1;
}

#elif defined(CONFIG_PPC64)

pgprot_t pgprot_wc(pgprot_t _prot)
{
return __pgprot((pgprot_val(_prot) | _PAGE_NO_CACHE) &
 ~(pgprot_t)_PAGE_GUARDED);
}

int mlx4_wc_enabled(void)
{
return 1;
}

#else   /* !(defined(__i386__) || defined(__x86_64__)) */

pgprot_t pgprot_wc(pgprot_t _prot)
{
return pgprot_noncached(_prot);
}

int mlx4_wc_enabled(void)
{
return 0;
}

#endif

I then use mlx4_wc_enabled() to determine whether or not to use
blueflame (which is feature X in this case):

static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct
ib_device *ibdev, struct ib_udata *udata)
{


===>if (mlx4_wc_enabled()) {
resp.bf_reg_size  = dev->dev->caps.bf_reg_size;
resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
} else {
resp.bf_reg_size  = 0;
resp.bf_regs_per_page = 0;
}

I would like, though, to have the capability in the kernel API to
determine if write-combining is available on a given host.

I thought of possibly comparing the result returned by
pgprot_writecombine(prot) to that returned by pgprot_noncached(prot)
-- if they are identical, then assume that write-combining is not
supported. (pgprot_noncached() is the default mapping of
pgprot_writecombine if it is not defined under the arch directory --
see file include/linux/pgtable.h).

This has a problem, however, in that I have no way of determining what
value of "prot" to use when doing this comparison -- there may be some
architectures which use bits of the prot structure to determine per
specific call whether or not to use write-combining (i.e.,
pgprot_writecombine(prot) could invoke pgprot_noncached(prot) if
certain bits were set in the prot structure, or return a
write-combining prot value if those bits are not set).

Using a zeroed-out pgprot structure in the comparison, for example, may
not be appropriate. (we may be allowing blueflame when it should not
be, or preventing blueflame when it should be allowed).

Do you have any ideas for how to determine if in fact write-combining
is available? How about introducing an external variable (say
extern int write_combining_active) which would be initialized by the
kernel (per architecture) to be 1 or 0? 

-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ibv_create_qp() and max_inline_data behavior

2013-08-28 Thread Jack Morgenstein
On Wed, 28 Aug 2013 13:45:17 -0400
do...@rdmamojo.com wrote:

> Hi Yann.
> 
> 
> >
> > I'd like the behavior of ibv_create_qp() regarding
> > cap.max_inline_data be clearer:
> >
> > 1) Should ibv_create_qp() failed if max_inline_data requested being
> > greater than supported ?
> Since the max_inline_data is an extension, the InfiniBand spec doesn't
> specify the expected behavior.  However, the spirit of the spec is to
> return a failure if not all of the requested attributes can be
> satisfied.
> 

In the Infiniband Spec revision 1.2.1, section 11.2.4.1 CREATE QUEUE
PAIR, we see that the output modifiers specify that for all input
parameters, the queue be created with at least the number of resources
specified.
For example:
• The actual number of scatter/gather elements that can be speci-
  fied in Work Requests submitted to the Send Queue. If an error is
  not returned, this is guaranteed to be greater than or equal to the
  number requested.

Thus, although the max_inline_data is an extension, it should clearly
behave in exactly the same fashion.

-Jack


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next 0/8] Add Mellanox mlx5 driver for Connect-IB devices

2013-07-07 Thread Jack Morgenstein
On Wednesday 03 July 2013 22:26, Roland Dreier wrote:
> Look at the actual timer code.  del_timer_sync() won't work if
> something unrelated re-adds the timer, but it will work if the timer
> itself is what re-adds itself.
> 
> Documentation/DocBook/kernel-locking.tmpl says:
> 
>       Another common problem is deleting timers which restart
>       themselves (by calling add_timer() at the end
>       of their timer function).  Because this is a fairly common case
>       which is prone to races, you should use
> del_timer_sync()
>       (include/linux/timer.h)
>       to handle this case.  It returns the number of times the timer
>       had to be deleted before we finally stopped it from adding itself back
>       in.
> 
> which pretty clearly says that del_timer_sync() will work in this case.
> 
> Or look at the code using it in arch/sparc/kernel/led.c for example
> (just one of the first hits in my grep, there are many other
> examples).
> 
> Not a big deal but I'm pretty sure the flag isn't needed.

Thanks for the feedback, Roland!

You are correct, and I removed the "active" flag in V3 of the patch set (to be 
submitted shortly).

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 1/9] net/mlx5: Mellanox Connect-IB, core driver part 1/3

2013-07-04 Thread Jack Morgenstein
On Thursday 04 July 2013 17:17, Joe Perches wrote:
> Not the vertical spacing, the newline inside the quotes.
> ie: "unknown status code\n" should be "unknown status code"
> 
You're right!  I misunderstood, and did not notice the newline char.
Good catch!

While I was at it, though, I eliminated the blank lines between
the cases/default -- fewer lines in the file, and readable enough
this way.

Thanks!
-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 5/9] IB/mlx5: Mellanox Connect-IB, IB driver part 1/5

2013-07-04 Thread Jack Morgenstein
On Thursday 04 July 2013 16:15, Jack Morgenstein wrote:
> > > +   *inlen = sizeof **cqb + sizeof *(*cqb)->pas * ncont;
> > 
> > sizeof always uses parentheses
> 
I'll fix this, too.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 7/9] IB/mlx5: Mellanox Connect-IB, IB driver part 3/5

2013-07-04 Thread Jack Morgenstein
On Thursday 04 July 2013 00:10, Joe Perches wrote:
> On Wed, 2013-07-03 at 20:13 +0300, Or Gerlitz wrote:
> > From: Eli Cohen 
> 
> More trivia:
> 
> > diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h 
> > b/drivers/infiniband/hw/mlx5/mlx5_ib.h
> []
> > +#define mlx5_ib_dbg(dev, format, arg...)   \
> > +do {   
> > \
> > +   pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name,  \
> > +__func__, __LINE__, current->pid, ##arg);  \
> > +} while (0)
> 
> unnecessary do {} while (0)

In this case, you are correct. The pr_debug macro itself does this internally
where needed -- not surprising, since pr_debug must also be usable for code 
like:
if (foo)
pr_debug(...).
I'll remove the do-while in V3.

> > +static void clean_keys(struct mlx5_ib_dev *dev, int c)
> > +{
> > +   struct device *ddev = dev->ib_dev.dma_device;
> > +   struct mlx5_mr_cache *cache = &dev->cache;
> > +   struct mlx5_cache_ent *ent = &cache->ent[c];
> > +   struct mlx5_ib_mr *mr;
> > +   int size;
> > +   int err;
> > +
> > +   while (1) {
> > +   spin_lock(&ent->lock);
> > +   if (list_empty(&ent->head)) {
> > +   spin_unlock(&ent->lock);
> > +   return;
> > +   }
> > +   mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
> > +   list_del(&mr->list);
> > +   ent->cur--;
> > +   ent->size--;
> > +   spin_unlock(&ent->lock);
> > +   err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
> > +   if (err) {
> > +   mlx5_ib_warn(dev, "failed destroy mkey\n");
> 
> Are you leaking anything here by not freeing?
Actually, if the mkey-destroy fails, it is extremely risky to free
the memory resources it uses.  A tiny memory leak (in such a rare case)
is far preferable to a kernel crash.

> 
> > +   } else {
> > +   size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40);
> > +   dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
> > +   kfree(mr->pas);
> > +   kfree(mr);
> > +   }
> > +   };
> > +}
> 
> > +static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
> > +u64 length, struct ib_umem *umem,
> > +int npages, int page_shift,
> > +int access_flags)
> > +{
> []
> > +   mr = kzalloc(sizeof(*mr), GFP_KERNEL);
> > +   if (!mr) {
> > +   mlx5_ib_warn(dev, "allocation failed\n");
> 
> Another unnecessary OOM

Will remove for V3 

> > +   mr = ERR_PTR(-ENOMEM);
> > +   }
> > +
> > +   inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
> > +   in = vzalloc(inlen);
> > +   if (!in) {
> > +   mlx5_ib_warn(dev, "alloc failed\n");
> 
> here too.

Will remove for V3 

> > +   err = -ENOMEM;
> > +   goto err_1;
> > +   }
> 
Thanks again for the review!

-Jack 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 1/9] net/mlx5: Mellanox Connect-IB, core driver part 1/3

2013-07-04 Thread Jack Morgenstein
On Wednesday 03 July 2013 23:29, Joe Perches wrote:
> On Wed, 2013-07-03 at 20:13 +0300, Or Gerlitz wrote:
> > From: Eli Cohen 
> 
> trivial comments:
> 
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c 
> > b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
> []
> > +static const char *deliv_status_to_str(u8 status)
> > +{
> > +   switch (status) {
> > +   case MLX5_CMD_DELIVERY_STAT_OK:
> > +   return "no errors";
> []
> > +   default:
> > +   return "unknown status code\n";
> > +   }
> > +}
> Likely unnecessary newline for default case
All the cases here have newlines between them, to enhance readability.
(not just the default). If you feel strongly about this, I'll do the change
for V3.

> > +static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev,
> > + gfp_t flags)
> > +{
> > +   struct mlx5_cmd_mailbox *mailbox;
> > +
> > +   mailbox = kmalloc(sizeof(*mailbox), flags);
> > +   if (!mailbox) {
> > +   mlx5_core_dbg(dev, "failed allocation\n");
> > +   return ERR_PTR(-ENOMEM);
> > +   }
> 
> unnecessary OOM message.

Will remove these kmalloc failure printouts in V3
> 
> > +static void set_wqname(struct mlx5_core_dev *dev)
> > +{
> > +   struct mlx5_cmd *cmd = &dev->cmd;
> > +
> > +   strcpy(cmd->wq_name, "mlx5_cmd_");
> > +   strcat(cmd->wq_name, dev_name(&dev->pdev->dev));
> 
> More likely snprintf might be better.
> 
>   snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s",
>dev_name(&dev->pdev->dev));
> 
Will change this for V3
> How big is wq_name?
32 bytes. Should be enough to hold dev names. However, to be on the safe side
I'll do the change.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2 5/9] IB/mlx5: Mellanox Connect-IB, IB driver part 1/5

2013-07-04 Thread Jack Morgenstein
On Wednesday 03 July 2013 23:59, Joe Perches wrote:
> On Wed, 2013-07-03 at 20:13 +0300, Or Gerlitz wrote:
> > From: Eli Cohen 
> > diff --git a/drivers/infiniband/hw/mlx5/ah.c 
> > b/drivers/infiniband/hw/mlx5/ah.c
> []
> > +struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
> > +  struct mlx5_ib_ah *ah)
> > +{
> > +   u32 sgi;
> 
> sgi is used once here and looks more confusing than helpful
> 
Will fix in V3
> 
> []
> 
> > +static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
> > +{
> > +   void *cqe = get_cqe(cq, n & cq->ibcq.cqe);
> > +   struct mlx5_cqe64 *cqe64;
> > +
> > +   cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
> > +   return ((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^
> > +   !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
> 
> I think "foo ^ !!bar" is excessively tricky.
> 
The mlx4 driver already uses "!!foo ^ !!bar" in several places in the kernel 
(and this is old code).
I assume that your problem with the above code is that it uses "foo" and not 
"!!foo".

Please note, though, that this code is data-path code -- and in this specific 
case,  foo = !!foo
(since MLX5_CQE_OWNER_MASK = 1). We decided, therefore, in this specific case, 
not to add the unnecessary
"!!", even though at first glance it may look tricky -- performance here IMHO 
is more important.

> > +static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
> > +{
> 
> > +   pr_warn("unkonwn completion status\n");
> 
> unknown tyop

Will fix in V3

> []
> 
> > +static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
> > + struct ib_ucontext *context, struct mlx5_ib_cq *cq,
> > + int entries, struct mlx5_create_cq_mbox_in **cqb,
> > + int *cqe_size, int *index, int *inlen)
> []
> > +   *inlen = sizeof **cqb + sizeof *(*cqb)->pas * ncont;
> 
> sizeof always uses parentheses
> 
> > +   *cqb = vzalloc(*inlen);
> 
> Perhaps you may be using vzalloc too often.
> 
> Maybe you should have a helper allocating either
> from kmalloc or vmalloc as necessary based on size.
We will look into this.

Thanks for reviewing!

-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for/net-next 3/8] net/mlx5: Mellanox Connect-IB, core driver part 3/3

2013-07-02 Thread Jack Morgenstein
On Monday 01 July 2013 22:18, David Miller wrote:
> Actually, you should adjust both decrements to read "i--".
> 
OK, will do it.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for/net-next 3/8] net/mlx5: Mellanox Connect-IB, core driver part 3/3

2013-07-01 Thread Jack Morgenstein
On Saturday 29 June 2013 07:10, David Miller wrote:
> From: Or Gerlitz 
> Date: Wed, 26 Jun 2013 17:22:12 +0300
> 
> > +   for (--i; i >= 0; --i) {
> 
> Please, "i--" is more canonical in for() loops.
> 
> > +   for (--i; i >= 0; --i) {
> 
> Likewise.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
Hi Dave,

For the "for" loop initial value, "i" should be decremented before doing any
for-loop calculations (and it is not at all obvious if this is the ordering if 
we use
i--, and not --i).  Using --i in the initial value makes the ordering obvious.
However, I do agree with respect to the increment that --i and i-- are logically
identical.

Thus, the "for" loop could read:
for (--i; i >= 0; i--) {

However, my own personal opinion is that this is a bit confusing.
I would prefer to leave these lines as they are.

Is that OK with you?

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: MLX4 Cq Question

2013-05-21 Thread Jack Morgenstein
On Tuesday 21 May 2013 13:43, Or Gerlitz wrote:
> On 21/05/2013 13:42, Bart Van Assche wrote:
> > On 05/21/13 11:40, Or Gerlitz wrote:
> >> 2. is possible in the Linux kernel for one hard irq callback to flash on
> >> CPU X while another hard irq callback is running on the same CPU?
> >
> > I think that from kernel 2.6.35 on MSI IRQs are no longer nested. See 
> > also 
> > http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=753649dbc49345a73a2454c770a3f2d54d11aec6
> >  
> > or http://lwn.net/Articles/380931/
> 
> thanks, so suppose we agree on that, still the patch makes sense as the 
> race is there, but does the patch has to change?
> 
> Or.
> 
I just need to verify that the patch can be applied correctly on the upstream 
kernel.
The use of RCU (and not spinlock) makes sense from a performance standpoint
in any case. We do NOT want to force mlx4_cq_completion to have a spinlock
which is device-global, resulting in having completion event processing be
single-threaded in effect).
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: MLX4 Cq Question

2013-05-20 Thread Jack Morgenstein
ollowing a bit too deep into the chains. In the other code which uses
this radix tree the access is protected by the lock but
mlx4_cq_completion is running in the interrupt context and cannot
take locks, so instead it runs without any protection whatsoever."

The stack trace below is from the mlnx ofed 1.5.3 driver running under RHEL5.7.
(this driver uses the upstream kernel mlx4_cq_completion() code)

PID: 8178   TASK: 810b91a52400  CPU: 11  COMMAND: "universal_consu"

[exception RIP: radix_tree_lookup+38]
RIP: 8016405f  RSP: 81182fe3be90  RFLAGS: 00210002
RAX: 6b6b6b6b6b6b6b8b  RBX: 810c2fb9  RCX: 0006
RDX: 0001  RSI: 00c0  RDI: 6b6b6b6b6b6b6b6b
RBP: 00c0   R8: 0001   R9: 0920ea94
R10: 00b3  R11: 800c7ea5  R12: 00b3
R13: 810c2b15a280  R14: 810b7a98ff58  R15: 
ORIG_RAX:   CS: 0010  SS: 

***  ***

RIP: f5cd8859  RSP: f3eafa28  RFLAGS: 00200202
RAX:   RBX: f3eafaf8  RCX: 0b6d4f00
RDX: 001c  RSI: 001c  RDI: 
RBP: ffff810bf1a0a120   R8: 0000   R9: 0000
R10:   R11: 0000  R12: 0001
R13:   R14: 810b9e7d5bf0  R15: 
ORIG_RAX: ff4c  CS: 0023  SS: 002b

The original patch  was generated by Yishai Hadas,
and reviewed by Or Gerlitz and Jack Morgenstein.  A subsequent fix
by Jack Morgenstein was reviewed by Eli Cohen.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/cq.c |   14 --
 1 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c 
b/drivers/net/ethernet/mellanox/mlx4/cq.c
index ff91904..0e28258 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -57,8 +57,13 @@ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
 {
struct mlx4_cq *cq;
 
+   rcu_read_lock();
cq = radix_tree_lookup(&mlx4_priv(dev)->cq_table.tree,
   cqn & (dev->caps.num_cqs - 1));
+   if (cq)
+   atomic_inc(&cq->refcount);
+   rcu_read_unlock();
+
if (!cq) {
mlx4_dbg(dev, "Completion event for bogus CQ %08x\n", cqn);
return;
@@ -67,6 +72,9 @@ void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn)
++cq->arm_sn;
 
cq->comp(cq);
+
+   if (atomic_dec_and_test(&cq->refcount))
+   complete(&cq->free);
 }
 
 void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int event_type)
@@ -74,13 +82,13 @@ void mlx4_cq_event(struct mlx4_dev *dev, u32 cqn, int 
event_type)
struct mlx4_cq_table *cq_table = &mlx4_priv(dev)->cq_table;
struct mlx4_cq *cq;
 
-   spin_lock(&cq_table->lock);
+   rcu_read_lock();
 
cq = radix_tree_lookup(&cq_table->tree, cqn & (dev->caps.num_cqs - 1));
if (cq)
atomic_inc(&cq->refcount);
 
-   spin_unlock(&cq_table->lock);
+   rcu_read_unlock();
 
if (!cq) {
mlx4_warn(dev, "Async event for bogus CQ %08x\n", cqn);
@@ -328,6 +336,7 @@ err_radix:
spin_lock_irq(&cq_table->lock);
radix_tree_delete(&cq_table->tree, cq->cqn);
spin_unlock_irq(&cq_table->lock);
+   synchronize_rcu();
 
 err_icm:
mlx4_cq_free_icm(dev, cq->cqn);
@@ -351,6 +360,7 @@ void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq)
spin_lock_irq(&cq_table->lock);
radix_tree_delete(&cq_table->tree, cq->cqn);
spin_unlock_irq(&cq_table->lock);
+   synchronize_rcu();
 
if (atomic_dec_and_test(&cq->refcount))
complete(&cq->free);
-- 
1.7.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Mellanox/RoCE - VLAN support

2013-04-13 Thread Jack Morgenstein
On Friday 12 April 2013 02:13, Or Gerlitz wrote:
> Klaus Wacker  wrote:
> > Hi all, we are doing a project on IBM x-blades using Mellanox/RoCE. I have 
> > some
> > difficulties to get RoCE based VLANs to run.
> > We are currently on Linux 3.8. No OFED material involved.
> > While non-VLAN devices work ok, i experience the following on VLAN devs
> > - a GID for the VLAN is created and we use the proper index for gid source 
> > addr.
> > - a ping on the VLAN-device runs ok
> > - rdma/roce traffic fails in that the sender overruns the send retry count.
> 
> Do you have any evidence that the packets are actually crossing the
> switch? the way we debug RoCE connectivity issues is configuring port
> mirroring on the switch and
> capture the mirrored packets on the host where packets are mirrored,
> and feed them to wireshark (the official/latest versions support
> parsing of RoCE/IB packets), on that host you can also observe  L2
> mac/vlan header of the packets to see if it matches what you expected.
> But even before that, I would check the switch counters to see the
> packets aren't dropped.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
Thanks, Or!

Klaus,
In addition, please provide us with the following information:
1. Which ConnectX FW version are you using?
2. Are you using RC or UD qp? (I assume RC if you have a retry count failure, 
but I would like to be sure).
3. Can you successfully run ucmatose? (tests rdma library for RC connections).
To run ucmatose:
on server:  ucmatose
on client:  ucmatose -s 

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Fail post send command on error recovery

2013-04-04 Thread Jack Morgenstein
On Thursday 04 April 2013 16:01, Kleber Sacilotto de Souza wrote:
> On 04/02/2013 02:00 PM, Roland Dreier wrote:
> >> diff --git a/drivers/infiniband/hw/mlx4/qp.c 
> >> b/drivers/infiniband/hw/mlx4/qp.c
> >> index 35cced2..0fa4f72 100644
> >> --- a/drivers/infiniband/hw/mlx4/qp.c
> >> +++ b/drivers/infiniband/hw/mlx4/qp.c
> >> @@ -2216,6 +2216,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct 
> >> ib_send_wr *wr,
> >> __be32 blh;
> >> int i;
> >>
> >> +   if (pci_channel_offline(to_mdev(ibqp->device)->dev->pdev))
> >> +   return -EIO;
> >> +
> >> spin_lock_irqsave(&qp->sq.lock, flags);
> >>
> >> ind = qp->sq_next_wqe;
> > 
> > To pile on to what Or and Jack asked, why here?  Why not in post_recv?
> >  Why not in mlx4_en?  What about userspace consumers?  What if the
> > error condition triggers just after the pci_channel_offline() check?
> > What if a command is queued but a PCI error occurs before the
> > completion can be returned?
> > 
> > Is there some practical scenario where this change makes a difference?
> > 
> > I would assume that in case of a PCI error, the driver would notice a
> > catastrophic error and send that asynchronous event to consumers, who
> > would know that commands might have been lost.
> > 
> 
> The problem that I'm trying to solve is that some IB core modules are
> hanging waiting on completion queues on their remove path during error
> recovery. I've added the pci offline check in post_send, which seemed to
> have to solved the problem, but while running other tests I was able to
> hit the bug again. Adding the check in post_recv also only hid the
> problem for a few testcases.
> 
> Adding any check in mlx4_en doesn't make sense in this case, because the
> problem is only with IB adapters. The ethernet/RoCE adapters are
> recovering fine, the check has been added already on the relevant places
> in mlx4_core.
> 
> What async event should be sent to consumers before calling the remove
> functions? IB_EVENT_DEVICE_FATAL, which is currently sent by mlx4_core
> in case of catastrophic error (but not in PCI error recovery), doesn't
> seem to be handled by most of the event handlers registered. Sending
> IB_EVENT_PORT_ERR seems to solve the problem for most modules, but
> rdma_cm, which doesn't have an event handler, is still hanging. Should
> we implement an event handler for rdma_cm?
>

This won't really help unless ALL userspace apps respond by calling 
ibv_close_device.
You can check this by running ibv_asyncwatch  (in libibverbs/examples). Until 
ibv_asyncwatch
is exited the low-level device restart won't work.

-Jack
> 
> Thanks!
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Fail post send command on error recovery

2013-04-02 Thread Jack Morgenstein
On Tuesday 02 April 2013 12:15, Or Gerlitz wrote:
> On 28/03/2013 20:45, Kleber Sacilotto de Souza wrote:
> > When the PCI adapter is going through error recovery, a call to
> > mlx4_ib_post_send() will return success without the command actually
> > arriving to the hardware. Adding a call to pci_channel_offline() to
> > check the state of the PCI slot and returning an error will allow the
> > upper layers to be aware that the command didn't succeed.
> 
> Is putting this call in fast path common practice in other (e.g 
> Ethernet) drivers?
> 
> Or.
> 
In addition, have you done any timing tests to see how this affects the
data path?  I am very concerned here that you penalize normal performance
for the sake of a very rare corner case.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] IB/mlx4: silence GCC warning

2013-02-25 Thread Jack Morgenstein
On Monday 25 February 2013 19:23, Roland Dreier wrote:
> On Mon, Feb 25, 2013 at 8:54 AM, Roland Dreier  wrote:
> > I'm finally noticing that this is in the build_mlx_header() function,
> > which is pretty much a slow path.  Certainly another compare isn't
> > going to change performance given all the other stuff we do there.
> >
> > Let me look at the patches that have gone by and see what the cleanest
> > way to handle this is.
> 
> OK, after playing around a bit, I see that just initializing vlan
> doesn't really change the generated code (my gcc at least was already
> if effect setting vlan in the generated assembly code), so I'll just
> merge that.
> 
>  - R.

Thanks!

-Jack 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] IB/mlx4: silence GCC warning

2013-02-24 Thread Jack Morgenstein
On Thursday 21 February 2013 11:02, Paul Bolle wrote:

> diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
> index 19e0637..512fde3 100644
> --- a/drivers/infiniband/hw/mlx4/qp.c
> +++ b/drivers/infiniband/hw/mlx4/qp.c
> @@ -1778,8 +1778,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, 
> struct ib_send_wr *wr,
>   }
>  
>   vlan = rdma_get_vlan_id(&sgid);
> - is_vlan = vlan < 0x1000;
>   }
Nice try!
However, this approach does add the line below to processing for an IB port 
(ETH/RoCE port stays same, more or less).
Processing time is therefore increased (at least on the IB side) relative to 
just living with the warning.

Roland?

> + is_vlan = vlan < 0x1000;  <=== Code line added to IB-side processing.

>   ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, 
> &sqp->ud_header);
>  
>   if (!is_eth) {

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Fwd: Error message when trying to use Infiniband virtual functions in virtual machine

2013-01-29 Thread Jack Morgenstein
; 11808 for ICM.
> [ 1425.576175] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1180c for ICM.
> [ 1425.577533] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11810 for ICM.
> [ 1425.578899] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11814 for ICM.
> [ 1425.580246] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11818 for ICM.
> [ 1425.581593] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1181c for ICM.
> [ 1425.582929] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11820 for ICM.
> [ 1425.584289] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11824 for ICM.
> [ 1425.585645] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11828 for ICM.
> [ 1425.587011] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1182c for ICM.
> [ 1425.588376] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11830 for ICM.
> [ 1425.589740] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11834 for ICM.
> [ 1425.591098] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11838 for ICM.
> [ 1425.592460] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1183c for ICM.
> [ 1425.593820] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11840 for ICM.
> [ 1425.595170] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11844 for ICM.
> [ 1425.596532] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11848 for ICM.
> [ 1425.597912] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1184c for ICM.
> [ 1425.599257] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11850 for ICM.
> [ 1425.600610] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11854 for ICM.
> [ 1425.601967] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11858 for ICM.
> [ 1425.603314] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1185c for ICM.
> [ 1425.604674] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11860 for ICM.
> [ 1425.606018] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11864 for ICM.
> [ 1425.607374] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11868 for ICM.
> [ 1425.608741] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1186c for ICM.
> [ 1425.610095] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11870 for ICM.
> [ 1425.611456] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11874 for ICM.
> [ 1425.612815] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 11878 for ICM.
> [ 1425.614165] mlx4_core :82:00.0: Mapped 1 chunks/256 KB at
> 1187c for ICM.
> [ 1426.193222] mlx4_core :82:00.0: Started init_resource_tracker: 80 
> slaves
> [ 1426.193692] mlx4_core :82:00.0: irq 178 for MSI/MSI-X
> [ 1426.193705] mlx4_core :82:00.0: irq 179 for MSI/MSI-X
> [ 1426.200029] mlx4_core :82:00.0: NOP command IRQ test passed
> [ 1426.200605] mlx4_core: Initializing :05:00.1
> [ 1426.200618] mlx4_core :05:00.1: enabling device ( -> 0002)
> [ 1426.200654] mlx4_core :05:00.1: Skipping virtual function:1
> [ 1426.200674] mlx4_core: Initializing :82:00.1
> [ 1426.200685] mlx4_core :82:00.1: enabling device ( -> 0002)
> [ 1426.200711] mlx4_core :82:00.1: Skipping virtual function:1
> [ 1438.961741] pci-stub :82:00.1: claimed by stub
> [ 1438.967250] pci-stub :82:00.1: enabling device ( -> 0002)
> [ 1438.969739] mlx4_core :82:00.0: FLR event for slave: 1
> [ 1438.969775] mlx4_core :82:00.0: mlx4_handle_slave_flr
> [ 1438.969776] mlx4_core :82:00.0: mlx4_handle_slave_flr: clean slave: 1
> [ 1439.070931] assign device 0:82:0.1
> [ 1439.073469] mlx4_core :82:00.0: FLR event for slave: 1
> [ 1439.073526] mlx4_core 0000:82:00.0: mlx4_handle_slave_flr
> [ 1439.073529] mlx4_core :82:00.0: mlx4_handle_slave_flr: clean slave: 1
> [ 1439.226723] mlx4_core :82:00.0: Received reset from slave:1
> [ 1439.239542] pci-stub :82:00.1: irq 180 for MSI/MSI-X
> [ 1439.272548] pci-stub :82:00.1: irq 180 for MSI/MSI-X
> [ 1439.272559] pci-stub :82:00.1: irq 181 for MSI/MSI-X
> [ 1449.381120] mlx4_core :82:00.0: Received reset from slave:1
> 
> 
> mlx4_core: Mellanox ConnectX core driver v1.1 (Dec, 2011)
> mlx4_core: Initializing :00:0b.0
> mlx4_core :00:0b.0: enabling device ( -> 0002)
> mlx4_core :00:0b.0: Detected virtual function - running in slave mode
> mlx4_core :00:0b.0: Sending reset
> mlx4_core :00:0b.0: Sending vhcr0
> mlx4_core :00:0b.0: 64B EQEs/CQEs supported by the device but not enabled
> mlx4_core :00:0b.0: HCA minimum page size:512
> mlx4_core :00:0b.0: irq 48 for MSI/MSI-X
> mlx4_core :00:0b.0: irq 49 for MSI/MSI-X
> mlx4_core 000

Re: Fwd: Error message when trying to use Infiniband virtual functions in virtual machine

2013-01-29 Thread Jack Morgenstein
On Tuesday 29 January 2013 15:15, Mathis GAVILLON wrote:
> In fact, one boxe wasn't check in IOMMU submenu (Support for Interrupt
> Remapping (EXPERIMENTAL)). I've recompiled the kernel of the host and
> guest (is it necessary for this last one ?). But the error message is
> already present. Just below, new dmeg log from host and guest O/S
> (mlx4_core loaded with options num_vfs=1 probe_vf=1 debug_level=1 on
> host and debug_level=1 on guest) :
> 
Please do the following:
1. Check the Support for Interrupt Remapping (EXPERIMENTAL) box and 
recompile/reinstall.
2. set probe_vf=0, so that no vfs will run on the Hypervisor.  That way you will
   see if any VF comes up without the "noise" in /var/log/messages in the 
Hypervisor generated
   by having one of the VFs run on the Hypervisor.

3. I noticed that you are enabling only a single VF, and that you are having the
   VF come up first on the Hypervisor, then moving it to run on a guest.
   It is better not to do things that way. Best to have it come up cleanly once 
on
   the guest. (the FLR event in your log results from this move.  The VF gets 
reset).

   If you must have a VF running on the Hypervisor, use "num_vfs=2 probe_vf=1".
   You will then see Devices ...xx.0, ...xx.1 and ...xx.2 in lspci.  The first 
VF
   (...xx.1) will be initialized on the Hypervisor.  Attach ...xx.2 to the VM.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Fwd: Error message when trying to use Infiniband virtual functions in virtual machine

2013-01-29 Thread Jack Morgenstein
On Tuesday 29 January 2013 15:15, Mathis GAVILLON wrote:
> In fact, one boxe wasn't check in IOMMU submenu (Support for Interrupt
> Remapping (EXPERIMENTAL)). I've recompiled the kernel of the host and
> guest (is it necessary for this last one ?)

Yes, it is necessary.  Without that checked, I noticed that SRIOV did not work 
properly.

-Jack
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Fwd: Error message when trying to use Infiniband virtual functions in virtual machine

2013-01-28 Thread Jack Morgenstein
nk is not ready
> [65190.061269] mlx4_core :82:00.1: mlx4_ib: multi-function enabled
> [65190.061271] mlx4_core :82:00.1: mlx4_ib: operating in qp1 tunnel mode
> [65190.071641] IPv6: ADDRCONF(NETDEV_UP): ib5: link is not ready
> 
> Here is dmesg command return of virtual machine (mlx4_core loaded with
> options debug_level=1) :
> 
> [root@welling0 ~]# dmesg
> mlx4_core: Mellanox ConnectX core driver v1.1 (Dec, 2011)
> mlx4_core: Initializing :00:08.0
> mlx4_core :00:08.0: Detected virtual function - running in slave mode
> mlx4_core :00:08.0: Sending reset
> mlx4_core :00:08.0: Sending vhcr0
> mlx4_core :00:08.0: BlueFlame not available
> mlx4_core :00:08.0: Base MM extensions: flags 00104cc0, rsvd L_Key 
> 8000
> mlx4_core :00:08.0: Max ICM size 4294967296 MB
> mlx4_core :00:08.0: Max QPs: 16777216, reserved QPs: 64, entry size: 256
> mlx4_core :00:08.0: Max SRQs: 16777216, reserved SRQs: 64, entry size: 128
> mlx4_core :00:08.0: Max CQs: 16777216, reserved CQs: 128, entry size: 128
> mlx4_core :00:08.0: Max EQs: 32, reserved EQs: 28, entry size: 128
> mlx4_core :00:08.0: reserved MPTs: 256, reserved MTTs: 32
> mlx4_core :00:08.0: Max PDs: 131072, reserved PDs: 4, reserved UARs: 7
> mlx4_core :00:08.0: Max QP/MCG: 131072, reserved MGMs: 0
> mlx4_core :00:08.0: Max CQEs: 4194304, max WQEs: 16384, max SRQ WQEs: 
> 16384
> mlx4_core :00:08.0: Local CA ACK delay: 15, max MTU: 4096, port width 
> cap: 2
> mlx4_core :00:08.0: Max SQ desc size: 1008, max SQ S/G: 62
> mlx4_core :00:08.0: Max RQ desc size: 512, max RQ S/G: 32
> mlx4_core :00:08.0: Max GSO size: 131072
> mlx4_core :00:08.0: Max counters: 256
> mlx4_core :00:08.0: Max RSS Table size: 256
> mlx4_core :00:08.0: DEV_CAP flags:
> mlx4_core :00:08.0: RC transport
> mlx4_core :00:08.0: UC transport
> mlx4_core :00:08.0: UD transport
> mlx4_core :00:08.0: XRC transport
> mlx4_core :00:08.0: FCoIB support
> mlx4_core :00:08.0: SRQ support
> mlx4_core :00:08.0: IPoIB checksum offload
> mlx4_core :00:08.0: P_Key violation counter
> mlx4_core :00:08.0: Q_Key violation counter
> mlx4_core :00:08.0:     DPDP
> mlx4_core :00:08.0: Big LSO headers
> mlx4_core :00:08.0: APM support
> mlx4_core :00:08.0: Atomic ops support
> mlx4_core :00:08.0: Address vector port checking support
> mlx4_core :00:08.0: UD multicast support
> mlx4_core :00:08.0: Router support
> mlx4_core :00:08.0: IBoE support
> mlx4_core :00:08.0: Unicast loopback support
> mlx4_core :00:08.0: FCS header control
> mlx4_core :00:08.0: UDP RSS support
> mlx4_core :00:08.0: Unicast VEP steering support
> mlx4_core :00:08.0: Multicast VEP steering support
> mlx4_core :00:08.0: Counters support
> mlx4_core :00:08.0: Port management change event support
> mlx4_core :00:08.0: 64 byte EQE support
> mlx4_core :00:08.0: 64 byte CQE support
> mlx4_core :00:08.0: RSS support
> mlx4_core :00:08.0: RSS Toeplitz Hash Function support
> mlx4_core :00:08.0: RSS XOR Hash Function support
> mlx4_core :00:08.0: Device manage flow steering support
> mlx4_core :00:08.0: 64B EQEs/CQEs supported by the device but not enabled
> mlx4_core :00:08.0: HCA minimum page size:512
> mlx4_core :00:08.0: Steering mode is: B0 steering
> mlx4_core :00:08.0: Failed to map blue flame area
> mlx4_core :00:08.0: irq 48 for MSI/MSI-X
> mlx4_core :00:08.0: irq 49 for MSI/MSI-X
> mlx4_core :00:08.0: failed execution of VHCR_POST commandopcode 0x31
> mlx4_core :00:08.0: NOP command failed to generate MSI-X interrupt IRQ 
> 49).
> mlx4_core :00:08.0: Trying again without MSI-X.
> mlx4_core: probe of :00:08.0 failed with error -16
> 
> Mathis
> 
> 
> 2013/1/24 Mathis GAVILLON :
> > -- Forwarded message --
> > From: Or Gerlitz 
> > Date: 2013/1/24
> > Subject: Re: Error message when trying to use Infiniband virtual
> > functions in virtual machine
> > To: Mathis GAVILLON 
> > Cc : linux-rdma@vger.kernel.org, Jack Morgenstein 
> >
> >
> > On Wed, Jan 23, 2013 at 5:29 PM, Mathis GAVILLON  wrote:
> >
> >> I'm working on a project in which I try to virtualize Infiniband. I use 
> >> ConnectX3 Infiniband cards with FW 2.9.1200.
> >> I've installed a Fedora 18 OS with 3.8.0rc2 recompiled kernel. I use KVM 
> >> 1.2 to run virtual machines.
> >
> > Your firmware version is a bit old... if you can upgrade to the latest
> > GA on the Mellanox site (2.11.0500) would be good,
> >
> > So what version of  qemu is installed on the distro you use? Are you
> > able to work over a ConnectX VF if probed
> > to the host? e.g use num_vfs=3 probe_vf=1 and see how the related mlx4
> > device (e.g mlx4_1) is functioning.
> >
> > Also would be of help if you probe mlx4_core on both VM and host with
> > debug_level=1
> >
> >
> > Or.
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/lmx4: silence GCC warning

2012-10-10 Thread Jack Morgenstein
You could use:

   u16 uninitialized_var(vlan);

instead.

Although this is in the special QP data flow, I still prefer to avoid adding extra 
code (even setting
initial values at procedure entry).  The line above will also do the job.  
"uninitialized_var"
is used elsewhere in the driver.  See, for example, mlx4_ib_post_send() in the 
same file (qp.c).

-Jack

On Friday 28 September 2012 14:48, Paul Bolle wrote:
> Building qp.o (part of the "Mellanox ConnectX HCA support" driver)
> triggers this GCC warning:
> drivers/infiniband/hw/mlx4/qp.c: In function ‘mlx4_ib_post_send’:
> drivers/infiniband/hw/mlx4/qp.c:1828:30: warning: ‘vlan’ may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
> drivers/infiniband/hw/mlx4/qp.c:1718:6: note: ‘vlan’ was declared here
> 
> Looking at the code it is clear 'vlan' is only set and used if 'is_eth'
> is non-zero. But there's no harm in initializing 'vlan' to 0 (which
> matches ib_get_cached_gid()'s default return) to silence GCC.
> 
> Signed-off-by: Paul Bolle 
> ---
> 0) I noticed this warning while building v3.6-rc7 on current Fedora 17,
> using Fedora's default config.
> 
> 1) Compile tested only. I tested against v3.6-rc7, with commit
> a41262bb5721f2b708ee8b23f67be2f2e16a2fab ("IB/mlx4: SR-IOV IB context
> objects and proxy/tunnel SQP") from linux-next cherry-picked, to take
> into account a trivial context change in linux-next.
> 
>  drivers/infiniband/hw/mlx4/qp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
> index a862251..71fdda6 100644
> --- a/drivers/infiniband/hw/mlx4/qp.c
> +++ b/drivers/infiniband/hw/mlx4/qp.c
> @@ -1715,7 +1715,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, 
> struct ib_send_wr *wr,
>   int is_eth;
>   int is_vlan = 0;
>   int is_grh;
> - u16 vlan;
> + u16 vlan = 0;
>   int err = 0;
>  
>   send_size = 0;
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] net/mlx4_core: Adjust the flow steering attach wrapper so that VFs work under SRIOV/IB

2012-10-03 Thread Jack Morgenstein
Currently, the Infiniband stack does not support flow steering at the
verbs level. As a result, the only usage of flow steering in the
IB driver is for L2 multicast attaches. Thus, we need to add the IB case to
procedure mlx4_QP_FLOW_STEERING_ATTACH_wrapper() to allow IPoIB to work on
VFs over ConnectX3 when flow steering is enabled.

Currently, the IB case in mlx4_QP_FLOW_STEERING_ATTACH_wrapper()
is missing, so the procedure returns -EINVAL and IPoIB on VFs fails to operate.

Signed-off-by: Jack Morgenstein 
Signed-off-by: Or Gerlitz 
---
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

Commit 7fb40f87... "net/mlx4_core: Add security check / enforcement for flow 
steering
rules set for VMs" left a hole which causes failures on multicast attaches done 
by SRIOV/IB
VFs over ConnectX3 when flow steering is enabled.  When it was submitted, 
SRIOV/IB was
not yet in the upstream kernel.

Additionally, the V2 patch set was submitted before 7fb40f87.., hence that 
commit needs
an adjustment for SRIOV/IB VF multicast attach support. 

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c 
b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index ba6506f..926c911 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -3094,6 +3094,8 @@ int mlx4_QP_FLOW_STEERING_ATTACH_wrapper(struct mlx4_dev 
*dev, int slave,
if (validate_eth_header_mac(slave, rule_header, rlist))
return -EINVAL;
break;
+   case MLX4_NET_TRANS_RULE_ID_IB:
+   break;
case MLX4_NET_TRANS_RULE_ID_IPV4:
case MLX4_NET_TRANS_RULE_ID_TCP:
case MLX4_NET_TRANS_RULE_ID_UDP:
-- 
1.7.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] mlx4_core: Disable SENSE_PORT for multifunction devices

2012-10-02 Thread Jack Morgenstein
On Thursday 27 September 2012 22:58, Or Gerlitz wrote:
> On Thu, Sep 27, 2012 at 7:47 PM, Roland Dreier  wrote:
> > From: Roland Dreier 
> >
> > In the current driver, the SENSE_PORT firmware command is issued as a
> > "wrapped" command, but the command handling code doesn't have a
> > wrapper, so it will never do anything other than log an error message.
> > The latest ConnectX-3 2.11.500 firmware reports the SENSE_PORT
> > capability even in multi-function (SR-IOV) mode, so the driver will
> > try to issue the command.
> >
> > At least until the driver has a proper wrapper for SENSE_PORT, make
> > sure we disable the command for multi-function devices.
> 
> makes sense, nice doing!
> 
> Acked-by: Or Gerlitz  for patches 1-3
> 
> Roland, I see that these three patches are queued @ your for-next and
> also the initial
> patch which in a way is more lengthy and heavy.  I wonder whether
> wouldn't it be fare to allow for Jack to review it before pushing?
> Jack is back by Tuesday.
>
Acked-by: Jack Morgenstein  for patches 1-3

Nice going, Roland!

-Jack

> Or.
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] mlx4_core: Fix crash on uninitialized priv->cmd.slave_sem

2012-10-02 Thread Jack Morgenstein
Acked-by: Jack Morgenstein 

Thanks, Roland! Good catches and good fixes!

Regarding the mutex replacing the semaphore, at one time we toyed
with the idea of multiple comm channel commands "in the air", but
we did not pursue the idea.

I agree with changing slave_sem to a mutex.  If we do decide at some point
to do multiple comm channel commands, we will change it back.

Regarding moving initialization of slave_sem and vhcr, your later patches
render it unnecessary.  However, I agree with this change -- this fixes
an "Oops waiting to happen". Though the change is no longer needed now, it
will be if SENSE_PORT is allowed in multifunction, and then we WILL start
experiencing the same Oops as we did before your patches disabled
SENSE_PORT for multifunction. Fixes like these are too easy to overlook.

-Jack

On Wednesday 26 September 2012 06:42, Roland Dreier wrote:
> From: Roland Dreier 
> 
> On an SR-IOV master device, __mlx4_init_one() calls mlx4_init_hca()
> before mlx4_multi_func_init().  However, for unlucky configurations,
> mlx4_init_hca() might call mlx4_SENSE_PORT() (via mlx4_dev_cap()), and
> that calls mlx4_cmd_imm() with MLX4_CMD_WRAPPED set.
> 
> However, on a multifunction device with MLX4_CMD_WRAPPED, __mlx4_cmd()
> calls into mlx4_slave_cmd(), and that immediately tries to do
> 
>   down(&priv->cmd.slave_sem);
> 
> but priv->cmd.slave_sem isn't initialized until mlx4_multi_func_init()
> (which we haven't called yet).  The next thing it tries to do is access
> priv->mfunc.vhcr, but that hasn't been allocated yet.
> 
> Fix this by moving the initialization of slave_sem and vhcr up into
> mlx4_cmd_init(). Also, since slave_sem is really just being used as a
> mutex, convert it into a slave_cmd_mutex.
> 
> Signed-off-by: Roland Dreier 
> ---
> Jack, I needed this to get my (CX3 w/ FW 2.11.500) adapter to work in
> SR-IOV mode.  Is it possible you never tested SR-IOV on an adapter
> with ports in autosensing mode?
> 
>  drivers/net/ethernet/mellanox/mlx4/cmd.c  |   51 
> ++---
>  drivers/net/ethernet/mellanox/mlx4/main.c |   11 ---
>  drivers/net/ethernet/mellanox/mlx4/mlx4.h |2 +-
>  3 files changed, 38 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
> b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> index 90774b7..3d1899f 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> @@ -395,7 +395,8 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 
> in_param, u64 *out_param,
>   struct mlx4_vhcr_cmd *vhcr = priv->mfunc.vhcr;
>   int ret;
>  
> - down(&priv->cmd.slave_sem);
> + mutex_lock(&priv->cmd.slave_cmd_mutex);
> +
>   vhcr->in_param = cpu_to_be64(in_param);
>   vhcr->out_param = out_param ? cpu_to_be64(*out_param) : 0;
>   vhcr->in_modifier = cpu_to_be32(in_modifier);
> @@ -403,6 +404,7 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 
> in_param, u64 *out_param,
>   vhcr->token = cpu_to_be16(CMD_POLL_TOKEN);
>   vhcr->status = 0;
>   vhcr->flags = !!(priv->cmd.use_events) << 6;
> +
>   if (mlx4_is_master(dev)) {
>   ret = mlx4_master_process_vhcr(dev, dev->caps.function, vhcr);
>   if (!ret) {
> @@ -439,7 +441,8 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 
> in_param, u64 *out_param,
>   mlx4_err(dev, "failed execution of VHCR_POST command"
>"opcode 0x%x\n", op);
>   }
> - up(&priv->cmd.slave_sem);
> +
> + mutex_unlock(&priv->cmd.slave_cmd_mutex);
>   return ret;
>  }
>  
> @@ -1559,14 +1562,15 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, 
> int slave, u8 cmd,
>   if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) &&
>   (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST))
>   goto reset_slave;
> - down(&priv->cmd.slave_sem);
> +
> + mutex_lock(&priv->cmd.slave_cmd_mutex);
>   if (mlx4_master_process_vhcr(dev, slave, NULL)) {
>   mlx4_err(dev, "Failed processing vhcr for slave:%d,"
>" resetting slave.\n", slave);
> - up(&priv->cmd.slave_sem);
> + mutex_unlock(&priv->cmd.slave_cmd_mutex);
>   goto reset_slave;
>   }
> - up(&priv->cmd.slave_sem);
> + mutex_unlock(&priv->cmd.slave_cmd_mutex);
>   bre

Re: [PATCH for-next V2 01/22] IB/core: Reserve bits in enum ib_qp_create_flags for low-level driver use

2012-09-25 Thread Jack Morgenstein
Thanks, Roland!

-Jack

On Monday 24 September 2012 21:34, Roland Dreier wrote:
> So I applied this whole series, with the plan to merge this for 3.7.
> 
> Please send any changes as patches on top of what's already merged.
> 
> Thanks,
>   Roland
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: linux-next: build failure after merge of the akpm tree

2012-09-25 Thread Jack Morgenstein
Hi Roland,
I am on vacation until next Tuesday -- I'll look at this then.

-Jack


On Monday 24 September 2012 21:36, Roland Dreier wrote:
> On Mon, Sep 24, 2012 at 7:02 AM, Stephen Rothwell  
> wrote:
> > After merging the akpm tree, today's linux-next build (powerpc
> > ppc64_defconfig) failed like this:
> >
> > drivers/infiniband/hw/mlx4/cm.c: In function 'id_map_alloc':
> > drivers/infiniband/hw/mlx4/cm.c:228:36: error: 'MAX_ID_MASK' undeclared 
> > (first use in this function)
> >
> > Caused by commit d7a4e9b679e9 ("IB/mlx4: Add CM paravirtualization") from
> > the infiniband tree interacting with commit "idr: rename MAX_LEVEL to
> > MAX_IDR_LEVEL" from the akpm tree.
> >
> > I have added the following merge fix patch for today:
> >
> > From: Stephen Rothwell 
> > Date: Mon, 24 Sep 2012 23:57:53 +1000
> > Subject: [PATCH] IB/mlx4: fix for MAX_ID_MASK to MAX_IDR_MASK name change
> >
> > Signed-off-by: Stephen Rothwell 
> > ---
> >  drivers/infiniband/hw/mlx4/cm.c |2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/infiniband/hw/mlx4/cm.c 
> > b/drivers/infiniband/hw/mlx4/cm.c
> > index e25e4da..80079e5 100644
> > --- a/drivers/infiniband/hw/mlx4/cm.c
> > +++ b/drivers/infiniband/hw/mlx4/cm.c
> > @@ -225,7 +225,7 @@ id_map_alloc(struct ib_device *ibdev, int slave_id, u32 
> > sl_cm_id)
> > ret = idr_get_new_above(&sriov->pv_id_table, ent,
> > next_id, &id);
> > if (!ret) {
> > -   next_id = ((unsigned) id + 1) & MAX_ID_MASK;
> > +   next_id = ((unsigned) id + 1) & MAX_IDR_MASK;
> > ent->pv_cm_id = (u32)id;
> > sl_id_map_add(ibdev, ent);
> > }
> 
> Andrew, any preference on how to handle this merge?
> 
> Jack/Amir, I wonder if there's some way we can avoid this code
> entirely?  Is an IDR the right structure to use here, or would we
> be better off with a radix tree maybe (where we can assign our
> own ID)?
> 
>  - R.
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V2 04/22] IB/mlx4: SRIOV IB context objects and proxy/tunnel sqp support

2012-09-21 Thread Jack Morgenstein
On Friday 21 September 2012 02:15, Or Gerlitz wrote:
> On Tue, Sep 11, 2012 at 8:10 PM, Doug Ledford  wrote:
> > On 8/3/2012 4:40 AM, Jack Morgenstein wrote:
> > > struct mlx4_ib_sriov{} is created by the master only.
> > > It is a container for the following:
> > > 1. All the info required by the PPF to multiplex and de-multiplex MADs
> > >(including those from the PF). (struct mlx4_ib_demux_ctx demux)
> >
> > OK, so can we have at least a single reference to the various
> > abbreviations before using them exclusively?  I know PF and PPF may be
> > common, but it might be nice that they were used once in full form
> > before abbreviated in commit messages.
> 
> PF is physical function and PPF primary physical function
I'll do that during the review/resubmission process
> 
> >
> > > 2. All the info required to manage alias GUIDs (i.e., the GUID at
> > >index 0 that each guest perceives.  In fact, this is not the
> > >GUID which is actually at index 0, but is, in fact, the GUID
> > >which is at index[] in the physical table.
> >
> > OK, this has been one of the things that has made reviewing this
> > difficult.  I freely admit that I've steadfastly ignored SRIOV for as
> > long as I can, so maybe this is just me.  But, in the context of this
> > driver, how am I supposed to know which code paths will be on the host
> > and which on the guest?
> 
> For the mlx4 driver the approach taken was to para-virtualize mlx4_core
> such that both the PPF and VFs run the same driver but within that
> driver two flows are operative. In mlx4_core it should be pretty clear
> to see where are you now, in mlx4_ib sometimes less easy, I'll leave
> that to Jack
> to address with more details.
> 
The flows are indeed complex. The issue we have is that a single driver serves
all three modes: "Native" (i.e., non-SRIOV) mode, the SRIOV master (PF/PPF), and 
SRIOV slaves (VFs).

We have macros to assist in routing these flows:

mlx4_is_mfunc() -- test for multifunc active (i.e, we are in an sriov master or 
an sriov slave).
mlx4_is_master() -- we are an sriov master.  False if we are an SRIOV slave, or 
if we are in Native mode.
mlx4_is_slave() -- we are an SRIOV slave.  False if we are an SRIOV master, or 
if we are in Native mode.

You will see 
if (!mlx4_is_master)
return;

  in various procedures, to indicate that what follows is code that only an 
SRIOV master should execute.

For example, look at procedure 
void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, int port, int i, int 
val)
{
struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);

if (!mlx4_is_master(dev))
return;

priv->virt2phys_pkey[slave][port - 1][i] = val;
}

You will also see the tests in combination (e.g., if (!mlx4_is_master() && !mlx4_is_slave()) do something)

Finally, you will see lots of places where the test is positive, around a block 
of code:

if (mlx4_is_master()) {
do something;
}

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V2 02/22] IB/core: change pkey table lookups to support full and partial membership for the same pkey

2012-09-13 Thread Jack Morgenstein
On Wednesday 12 September 2012 19:48, Doug Ledford wrote:
> > On the Hypervisor, however, we assume that if both versions of the pkey are 
> > in its pkey table,
> > then for its own infiniband operation (as opposed to performing its pkey 
> > virtualizing function),
> > it should operate with the highest membership type in its table for a given 
> > 15-bit pkey.
> 
> That's what I was looking for.  So, how can you know this assumption is
> correct?  It seems to me that if someone wanted to restrict membership
> of the hypervisor as part of a security lockdown, then give full
> membership to a guest because that guest is some high security, single
> task guest, then this assumption would break things (the user would be
> able to assign the full membership key to the guest OK, but regardless
> of how they wanted the hypervisor to be subscribed to that particular
> pkey, it would always get the full membership from the guest).
> 
This issue, unfortunately, opens up a real "can of worms".
ib_find_cached_pkey() is used by the CM in determining the (15-bit) p_key
to be used for the connection (although 16-bit pkeys are placed in the
CM_REQ message (see IB Spec 1.2.1, table 99, page 667). The REQ handler on
the remote side finds the index in its pkey table which contains the
(15-bit) pkey enclosed in the REQ message.  This index is then used
when creating the local RC qp as its pkey_index.
AFAIK, no check is performed regarding compatible membership forms (i.e.,
that at least one of the two sides has the full membership form of the
15-bit pkey) -- it is the network administrator's responsibility to see
that the pkey configurations are correct.

Up to now, the ib_core driver has assumed that only 1 membership form per
15-bit pkey is contained in the local pkey table.

Now that both forms may exist in the Hypervisor's table, we do have a problem.
1. We cannot depend on ib_find_cached_pkey simply finding the first 15-bit pkey
   which matches its search, since we cannot depend on the order of full vs 
limited
   membership forms appearing in the pkey table (the order is not controllable 
-- see
   the example at the end of this post).
2. We therefore need a consistent policy for preferentially retrieving either 
the full
   or the partial member for a given 15-bit pkey.
3. This policy must be configurable by the administrator per host, per HCA.

The problems that I see:
1. What if the Hypervisor needs to have limited membership for some 
connections, but
   full membership for others?  Such a split policy is complex to implement -- 
it would
   require specifying PER PKEY whether the preferential return should be full, 
or should
   be limited.
2. What if the Hypervisor has several HCAs, and wishes a different policy for 
each HCA?

There may be more problems as well.  Liran?

In any event, the issue of multiple pkey forms in a given pkey table is 
actually separate
from SRIOV (it's just that SRIOV needs multiple forms, so the issue came up).
Solving it will require changes in the ib core driver.

I don't yet have a concrete proposal to fix this at present.  Any ideas would 
be appreciated.

-Jack

P.S. We have the same issue in procedure ib_find_pkey(), also in this patch, in 
core/device.c .
This procedure is used in lots of places: ipoib, core/multicast, sa_query.

I seem to recall that there were problems with IPoIB when partial membership 
pkeys are used.
Liran, do you recall something?

==
It is also impossible to demand that the pkey table be ordered so that if both 
limited
and full pkeys are present, then the full membership value always appears first.

A simple example:
  1. Fill the entire pkey table with full-membership pkeys. Say pkey A and pkey 
B are both
 in the table, and that pkey A appears first.

  2. Delete full-membership pkey A, and add the limited-membership value of 
Pkey B. This
 new limited-pkey-B will be entered into the pkey table by OpenSM in the 
position just
 vacated by the deleted pkey A -- it is the only available slot!.

  3. The Limited-membership pkey-B value will therefore unavoidably appear in 
the table
 before the full-membership pkey-B value.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V2 02/22] IB/core: change pkey table lookups to support full and partial membership for the same pkey

2012-09-12 Thread Jack Morgenstein
On Tuesday 11 September 2012 19:52, Doug Ledford wrote:
> On 8/3/2012 4:40 AM, Jack Morgenstein wrote:
> > Enhance the cached and non-cached pkey table lookups to enable limited and 
> > full
> > members of the same pkey to co-exist in the pkey table.
> > 
> > This is necessary for SRIOV to allow for a scheme where some guests would 
> > have the full
> > membership pkey in their virtual pkey table, where other guests on the same 
> > hypervisor
> > would have the limited one. In that sense, its an extension of the IBTA 
> > model for
> > non virtualized nodes.
> 
> OK, maybe I'm not getting something, but I'm curious why we always pick
> the full pkey in preference to the partial pkey.
> 
The pkey model that we are working with is the following:
- guests do NOT have both limited and full membership forms of a given pkey;
  they have either the limited member or the full member

- The guests will see a virtualized pkey table, where pkeys appearing in that 
table
  are "cherry-picked" by the administrator from the underlying physical 
(device) pkey
  table contents.  The slot (or index) occupied by a mapped pkey in the virtual 
table
  is also determined by the administrator.
  The Hypervisor (primary phys function or PPF), which sees the physical pkey 
table used
  for the mappings, will therefore have both forms of a given pkey -- with the 
limited form
  mapped to some slot in a guest's pkey table, and the full form mapped to some 
slot in
  a different guest's pkey table.

- The only reason that a pkey table would contain both full and limited 
membership versions
  of a given pkey is to support SRIOV guest pkey paravirtualization.  In 
"native" mode, there
  is no reason to have both forms in a given pkey table. (The driver clearly 
reflects this
  in its original implementation of "ib_find_cached_pkey" -- it returns the 
index of the first
  15-bit pkey it finds in the table, and assumes that these 15 bits are unique 
in the table).

Thus, on guests ib_find_cached_pkey will operate as before, since the 15-bit 
pkey in the guest
table will be unique.

On the Hypervisor, however, we assume that if both versions of the pkey are in 
its pkey table,
then for its own infiniband operation (as opposed to performing its pkey 
virtualizing function),
it should operate with the highest membership type in its table for a given 
15-bit pkey.

> Shouldn't we pick the
> pkey that's appropriate for the vHCA sending the message?

We do. When QPs on the guests are created, the modify-qp commands are not 
executed on the guest,
but rather are passed to the PPF for processing. The PPF replaces the 
guest-provided virtual pkey-index
value with the appropriate physical pkey-index value.  See procedure 
"update_pkey_index" in file
resource_tracker.c, and all the places it is called (i.e., in the wrapper 
functions for the various
modify-qp firmware commands).

> Also, given the rule of least surprise, don't you think it would be best
> to rename this function ib_find_cached_full_or_partial_pkey and in your
> next patch instead of naming it ib_find_exact_pkey just call that one
> ib_find_cached_pkey?
The naming of "ib_find_cached_pkey" refers to the 15-bit pkey value. I also
did not want to change all the places that "ib_find_cached_pkey" is used, to
call "ib_find_cached_full_or_partial_pkey" instead.

I also think it is more confusing to use the old name "ib_find_cached_pkey" in
a completely new (and incompatible) way.

Maybe I can change "ib_find_exact_pkey" to "ib_find_16_bit_pkey" ?

In this case, "ib_find_cached_pkey" will be backwards compatible and will find 
the
appropriate 15-bit pkey of either form (with the proviso that if both forms of 
the
pkey are in the table, it will return the full membership pkey index).

The ib_find_16_bit_pkey() looks for the exact match, and returns error if it 
does not find
it (-- specifically, if it is looking for one membership form, and only the 
other form is in
the table, it will return error).

> > To accomplish this, we need both the limited and full membership pkeys to 
> > be present
> > in the master's (hypervisor physical port) pkey table.
> > 
> > The algorithm for supporting pkey tables which contain both the limited and 
> > the full
> > membership versions of the same pkey works as follows:
> > 
> > When scanning the pkey table for a 15 bit pkey:
> > 
> > A. If there is a full member version of that pkey anywhere
> > in the table, return its index (even if a limited-member
> > version of the pkey exists earlier in the table).
> > 
> > B. If the full member version is not in the table,
> > but the limited-member ver

Re: [PATCH RFC for-next] net/mlx4_core: Fix racy flow in the driver CQ completion handler

2012-09-10 Thread Jack Morgenstein
On Monday 10 September 2012 16:27, Or Gerlitz wrote:
> I  took a look on the practice/wrapping used over the mm subsystem for 
> radix_tree_lookup calls, whose maintainer,
> Andrew Morton is signed on the patch Roland pointed to, its just 
> rcu_read_lock/unlock, seems this is what to do as well.
> 
In addition, need to do a synchronize_rcu when deleting, per
the comment in include/linux/rcupdate.h:

* It is still required that the caller manage the synchronization and lifetimes
* of the items. So if RCU lock-free lookups are used, typically this would mean
* that the items have their own locks, or are amenable to lock-free access; and
* that the items are freed by RCU (or only freed after having been deleted from
* the radix tree *and* a synchronize_rcu() grace period).

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC for-next] net/mlx4_core: Fix racy flow in the driver CQ completion handler

2012-09-10 Thread Jack Morgenstein
On Monday 10 September 2012 13:35, Or Gerlitz wrote:
> Jack,  Max
> 
> Actually, can't we do well with rcu_read_lock() in mlx4_cq_completion() 
> as that commit documentation suggests?
> 
I don't know. I do notice (in file include/linux/rcupdate.h) that 
rcu_read_lock/unlock
is meant to be used in the interrupt context. Would it be sufficient (besides 
rcu_read_lock/unlock calls) to add
a call to synchronize_rcu() in mlx4_cq_free (after calling synchronize_irq)?

Could we also then dispense with the spinlocks in mlx4_cq_event() as well?

Acquiring the SINGLE cq_table->lock spinlock for EVERY completion event
of EVERY cq seems very nasty to me (probably why Roland did not do this), and 
it would
clearly be desirable not to have to do this. 

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC for-next] net/mlx4_core: Fix racy flow in the driver CQ completion handler

2012-09-09 Thread Jack Morgenstein
On Sunday 09 September 2012 18:10, Roland Dreier wrote:
> 
> Please look at commit 7cf9c2c76c1a ("[PATCH] radix-tree: RCU lockless 
> readside")
> 
Roland,

What about the following note (from the commit diff mentioned above):
+/**
+ * Radix-tree synchronization
+ *
+ * The radix-tree API requires that users provide all synchronisation (with
+ * specific exceptions, noted below).
+ *
+ * Synchronization of access to the data items being stored in the tree, and
+ * management of their lifetimes must be completely managed by API users.
+ *
+ * For API usage, in general,
+ * - any function _modifying_ the the tree or tags (inserting or deleting
+ *   items, setting or clearing tags must exclude other modifications, and
+ *   exclude any functions reading the tree.
+ * - any function _reading_ the the tree or tags (looking up items or tags,
+ *   gang lookups) must exclude modifications to the tree, but may occur
+ *   concurrently with other readers.
+ *
+ * The notable exceptions to this rule are the following functions:
+ * radix_tree_lookup
+ * radix_tree_tag_get
+ * radix_tree_gang_lookup
+ * radix_tree_gang_lookup_tag
+ * radix_tree_tagged
+ *

=== JPM -- Note the following text! =
+ * The first 4 functions are able to be called locklessly, using RCU. The
+ * caller must ensure calls to these functions are made within rcu_read_lock()
+ * regions.
===

+ * Other readers (lock-free or otherwise) and modifications may be
+ * running concurrently.

Roland, the above states explicitly that radix_tree_lookup should be called 
only under rcu_read_lock()!
This is NOT the case in our driver, in mlx4_eq_int()/mlx4_cq_completion() .

Looks to me, then, that the spinlock in mlx4_cq_completion() is required.

- Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next V2 01/22] IB/core: Reserve bits in enum ib_qp_create_flags for low-level driver use

2012-09-06 Thread Jack Morgenstein
Thanks, Doug!

-Jack

On Wednesday 05 September 2012 17:55, Doug Ledford wrote:
> On 8/3/2012 4:40 AM, Jack Morgenstein wrote:
> > Reserve bits 26-31 for internal use by low-level drivers. Two
> > such bits are used in the mlx4 driver SRIOV IB implementation.
> > 
> > These enum additions guarantee that the core layer will never use
> > these bits, so that low level drivers may safely make use of them.
> > 
> > Signed-off-by: Jack Morgenstein 
> > ---
> >  include/rdma/ib_verbs.h |3 +++
> >  1 files changed, 3 insertions(+), 0 deletions(-)
> > 
> > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> > index 07996af..46bc045 100644
> > --- a/include/rdma/ib_verbs.h
> > +++ b/include/rdma/ib_verbs.h
> > @@ -614,6 +614,9 @@ enum ib_qp_type {
> >  enum ib_qp_create_flags {
> > IB_QP_CREATE_IPOIB_UD_LSO   = 1 << 0,
> > IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK   = 1 << 1,
> > +   /* reserve bits 26-31 for low level drivers' internal use */
> > +   IB_QP_CREATE_RESERVED_START = 1 << 26,
> > +   IB_QP_CREATE_RESERVED_END   = 1 << 31,
> >  };
> >  
> >  struct ib_qp_init_attr {
> > 
> 
> Reserving 6 bits for driver use out of 32 seems reasonable.
> 
> Acked-by: Doug Ledford 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Patchwork back online

2012-08-14 Thread Jack Morgenstein
On Tuesday 14 August 2012 19:20, Roland Dreier wrote:
> On Tue, Aug 14, 2012 at 7:56 AM, Jack Morgenstein
>  wrote:
> > I just checked patchwork, and do not see the bug-fix patch I submitted on 
> > Aug 3:
> > [PATCH] IB/mlx4: fix possible deadlock with sm_lock spinlock
> >
> > http://marc.info/?l=linux-rdma&m=134398272328023&w=2
> >
> > Please don't let this fall by the wayside -- it is independent of the 
> > SRIOV-IB patch set.
> 
> It's not in patchwork because I already applied it.
> 
>  - R.
> 
Sorry about that.  Thanks!

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Patchwork back online

2012-08-14 Thread Jack Morgenstein
On Tuesday 10 July 2012 20:08, Roland Dreier wrote:
> Not sure if everyone saw this, but thanks to the kernel.org crew,
> https://patchwork.kernel.org/project/linux-rdma/list/ is back online.
> 
> I've been delegating opensm patches to Alex Netes and diags
> patches to Ira Weiny.  Can someone refresh my memory about
> who looks after libibumad?  Is that Alex too?
> 
> Anyway, Alex and Ira, if you guys update patchwork when you
> apply or reject a patch, that would be great ... I'll try to avoid
> letting a big backlog of patches accumulate there too :)
> 
> Thanks,
>   Roland
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
Hi Roland,

I just checked patchwork, and do not see the bug-fix patch I submitted on Aug 3:
[PATCH] IB/mlx4: fix possible deadlock with sm_lock spinlock

http://marc.info/?l=linux-rdma&m=134398272328023&w=2

Please don't let this fall by the wayside -- it is independent of the SRIOV-IB 
patch set.

Thanks!
-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: 3.6-rc1 IB complaint

2012-08-08 Thread Jack Morgenstein
Hi Bart,

I submitted a patch to Roland on August 3 (along with SRIOV-IB V2) to fix this:
 
[PATCH] IB/mlx4: fix possible deadlock with sm_lock spinlock

I notice that you tested out the fix and it worked.

Roland, please take the patch and submit to Linus. This fixes a bug in
the upstream 3.6-RC1 code.

Thanks!

-Jack

On Tuesday 07 August 2012 19:48, Bart Van Assche wrote:
> Hello,
> 
> Has anyone else already seen the ugly kernel message below ? This
> message is generated during boot and prevents my IB HCA to come up
> properly with 3.6-rc1. This did not happen with kernel 3.5.
> 
> =
> [ INFO: inconsistent lock state ]
> 3.6.0-rc1-debug+ #1 Not tainted
> -
> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
> swapper/1/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
>  (&(&ibdev->sm_lock)->rlock){?.+...}, at: [] 
> update_sm_ah+0x94/0xd0 [mlx4_ib]
> {HARDIRQ-ON-W} state was registered at:
>   [] __lock_acquire+0x66a/0x1ca0
>   [] lock_acquire+0x95/0x130
>   [] _raw_spin_lock+0x45/0x80
>   [] mlx4_ib_process_mad+0x58b/0x7a0 [mlx4_ib]
>   [] ib_post_send_mad+0x34e/0x6d0 [ib_mad]
>   [] ib_umad_write+0x515/0x630 [ib_umad]
>   [] vfs_write+0xce/0x170
>   [] sys_write+0x54/0xa0
>   [] system_call_fastpath+0x16/0x1b
> irq event stamp: 306104
> hardirqs last  enabled at (306101): [] mwait_idle+0x95/0x180
> hardirqs last disabled at (306102): [] 
> common_interrupt+0x67/0x6c
> softirqs last  enabled at (306104): [] 
> _local_bh_enable+0x13/0x20
> softirqs last disabled at (306103): [] irq_enter+0x75/0x90
> 
> other info that might help us debug this:
>  Possible unsafe locking scenario:
> 
>CPU0
>
>   lock(&(&ibdev->sm_lock)->rlock);
>   
> lock(&(&ibdev->sm_lock)->rlock);
> 
>  *** DEADLOCK ***
> 
> 1 lock held by swapper/1/0:
>  #0:  (&(&priv->ctx_lock)->rlock){-.}, at: [] 
> mlx4_dispatch_event+0x39/0x90 [mlx4_core]
> 
> stack backtrace:
> Pid: 0, comm: swapper/1 Not tainted 3.6.0-rc1-debug+ #1
> Call Trace:
>[] print_usage_bug+0x219/0x220
>  [] mark_lock+0x36f/0x3f0
>  [] __lock_acquire+0x80a/0x1ca0
>  [] lock_acquire+0x95/0x130
>  [] ? update_sm_ah+0x94/0xd0 [mlx4_ib]
>  [] ? rdma_port_get_link_layer+0x1b/0x40 [ib_core]
>  [] _raw_spin_lock+0x45/0x80
>  [] ? update_sm_ah+0x94/0xd0 [mlx4_ib]
>  [] ? ib_create_ah+0x1a/0x40 [ib_core]
>  [] update_sm_ah+0x94/0xd0 [mlx4_ib]
>  [] handle_port_mgmt_change_event+0xeb/0x150 [mlx4_ib]
>  [] mlx4_ib_event+0x120/0x170 [mlx4_ib]
>  [] ? _raw_spin_lock_irqsave+0x83/0xa0
>  [] ? mlx4_dispatch_event+0x39/0x90 [mlx4_core]
>  [] mlx4_dispatch_event+0x6c/0x90 [mlx4_core]
>  [] mlx4_eq_int+0x4d0/0x920 [mlx4_core]
>  [] ? local_clock+0x4f/0x60
>  [] mlx4_msi_x_interrupt+0x14/0x20 [mlx4_core]
>  [] handle_irq_event_percpu+0x75/0x230
>  [] handle_irq_event+0x4e/0x80
>  [] handle_edge_irq+0x85/0x130
>  [] handle_irq+0x25/0x40
>  [] do_IRQ+0x5d/0xe0
>  [] common_interrupt+0x6c/0x6c
>[] ? mwait_idle+0x9e/0x180
>  [] ? mwait_idle+0x95/0x180
>  [] cpu_idle+0xa6/0xe0
>  [] start_secondary+0x204/0x206
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 09/22] {NET,IB}/mlx4: MAD_IFC paravirtualization

2012-08-03 Thread Jack Morgenstein
The MAD_IFC firmware command fulfills two functions.

In the first case, it is used in the QP0/QP1 MAD-handling flow to
obtain information from the FW (for answering queries), and for
setting variables in the HCA (MAD SET packets).

In this function, MAD_IFC should provide the FW (physical) view
of the data.  This is the view that OpenSM needs for its functioning.
We call this the "network view".

In the second case, MAD_IFC is used by various verbs to obtain data
regarding the local HCA (e.g., ib_query_device() ).
We call this the "host view".

This data needs to be paravirtualized.

MAD_IFC thus needs a wrapper function, and also needs another flag
indicating whether it should provide the network view (when it is
called by ib_process_mad in special-qp packet handling), or the
host view (when it is called while implementing a verb).

There are currently 2 flag parameters in mlx4_MAD_IFC already:
ignore_bkey and ignore_mkey.  These two parameters are replaced
by a single "mad_ifc_flags" parameter, with different bits set
for each flag.  A third flag is added: "network-view/host-view".

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c |   20 ++--
 drivers/infiniband/hw/mlx4/main.c|   64 +---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   14 +++-
 drivers/net/ethernet/mellanox/mlx4/cmd.c |  162 ++
 4 files changed, 234 insertions(+), 26 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index a392a5c..4c8650f 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -75,7 +75,7 @@ struct mlx4_rcv_tunnel_mad {
struct ib_mad mad;
 } __packed;
 
-int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 void *in_mad, void *response_mad)
 {
@@ -102,10 +102,13 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int 
ignore_mkey, int ignore_bkey,
 * Key check traps can't be generated unless we have in_wc to
 * tell us where to send the trap.
 */
-   if (ignore_mkey || !in_wc)
+   if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc)
op_modifier |= 0x1;
-   if (ignore_bkey || !in_wc)
+   if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc)
op_modifier |= 0x2;
+   if (mlx4_is_mfunc(dev->dev) &&
+   (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc))
+   op_modifier |= 0x8;
 
if (in_wc) {
struct {
@@ -138,10 +141,10 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int 
ignore_mkey, int ignore_bkey,
in_modifier |= in_wc->slid << 16;
}
 
-   err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma,
-  in_modifier, op_modifier,
+   err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, 
in_modifier,
+  mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : 
op_modifier,
   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
-  MLX4_CMD_NATIVE);
+  (op_modifier & 0x8) ? MLX4_CMD_NATIVE : 
MLX4_CMD_WRAPPED);
 
if (!err)
memcpy(response_mad, outmailbox->buf, 256);
@@ -610,8 +613,9 @@ static int ib_process_mad(struct ib_device *ibdev, int 
mad_flags, u8 port_num,
prev_lid = pattr.lid;
 
err = mlx4_MAD_IFC(to_mdev(ibdev),
-  mad_flags & IB_MAD_IGNORE_MKEY,
-  mad_flags & IB_MAD_IGNORE_BKEY,
+  (mad_flags & IB_MAD_IGNORE_MKEY ? 
MLX4_MAD_IFC_IGNORE_MKEY : 0) |
+  (mad_flags & IB_MAD_IGNORE_BKEY ? 
MLX4_MAD_IFC_IGNORE_BKEY : 0) |
+  MLX4_MAD_IFC_NET_VIEW,
   port_num, in_wc, in_grh, in_mad, out_mad);
if (err)
return IB_MAD_RESULT_FAILURE;
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index bdfd3ae..48cda35 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -98,7 +98,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
init_query_mad(in_mad);
in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
 
-   err = mlx4_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, 
out_mad);
+   err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS,
+  1, NULL, NULL, in_mad, out_mad);
if (err)
goto out;
 
@@ -182,11 +183,12 @@ mlx4_ib_port_link_layer(struct ib_device *device, u8 
port_num)
 }
 
 static int ib_link_query_port(struct ib_device *ibdev, u8 port,
-   

[PATCH for-next V2 04/22] IB/mlx4: SRIOV IB context objects and proxy/tunnel sqp support

2012-08-03 Thread Jack Morgenstein
1. Introduce the basic sriov parvirtualization context objects
   for multiplexing and demultiplexing MADs.
2. Introduce support for the new proxy and tunnel QP types.

This patch introduces the objects required by the master
for managing QP paravirtualization for guests.

struct mlx4_ib_sriov{} is created by the master only.
It is a container for the following:
1. All the info required by the PPF to multiplex and de-multiplex MADs
   (including those from the PF). (struct mlx4_ib_demux_ctx demux)
2. All the info required to manage alias GUIDs (i.e., the GUID at
   index 0 that each guest perceives.  In fact, this is not the
   GUID which is actually at index 0, but is, in fact, the GUID
   which is at index[] in the physical table.

3. structures which are used to manage CM paravirtualization
4. structures for managing the real special QPs when running in
   sriov mode.
   The real SQPs are controlled by the PPF in this case.  All SQPs
   created and controlled by the ib core layer are proxy sqps

struct mlx4_ib_demux_ctx{} contains the information per port needed
to manage paravirtualization.
This includes:
1. All multicast paravirt info
2. All tunnel-qp paravirt info for the port.
3. GUID-table and GUID-prefix for the port
4. work queues.

struct mlx4_ib_demux_pv_ctx{} contains all the info for managing
the paravirtualized QPs for one slave/port.

struct mlx4_ib_demux_pv_qp{} contains the info need to run an
individual QP (either tunnel qp or real SQP).

Note:  We made use of the 2 most-significant bits in enum
mlx4_ib_qp_flags (based upon enum ib_qp_create_flags (ib_verbs.h))

We need these bits in the low-level driver for internal purposes.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/cq.c  |   31 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  128 +++-
 drivers/infiniband/hw/mlx4/qp.c  |  616 ++
 include/linux/mlx4/device.h  |1 +
 include/linux/mlx4/qp.h  |3 +-
 5 files changed, 702 insertions(+), 77 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 6d4ef71..342fabd 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -547,6 +547,26 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 
checksum)
checksum == cpu_to_be16(0x);
 }
 
+static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, 
struct ib_wc *wc,
+  unsigned tail, struct mlx4_cqe *cqe)
+{
+   struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+   ib_dma_sync_single_for_cpu(qp->ibqp.device,
+  qp->sqp_proxy_rcv[tail].map,
+  sizeof(struct mlx4_ib_proxy_sqp_hdr),
+  DMA_FROM_DEVICE);
+   hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+   wc->pkey_index  = be16_to_cpu(hdr->tun.pkey_index);
+   wc->slid= be16_to_cpu(hdr->tun.slid_mac_47_32);
+   wc->sl  = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+   wc->src_qp  = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFF;
+   wc->wc_flags   |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+   wc->dlid_path_bits = 0;
+
+   return 0;
+}
+
 static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
struct mlx4_ib_qp **cur_qp,
struct ib_wc *wc)
@@ -559,6 +579,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
int is_error;
u32 g_mlpath_rqpn;
u16 wqe_ctr;
+   unsigned tail = 0;
 
 repoll:
cqe = next_cqe_sw(cq);
@@ -634,7 +655,8 @@ repoll:
mlx4_ib_free_srq_wqe(srq, wqe_ctr);
} else {
wq= &(*cur_qp)->rq;
-   wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+   tail  = wq->tail & (wq->wqe_cnt - 1);
+   wc->wr_id = wq->wrid[tail];
++wq->tail;
}
 
@@ -717,6 +739,13 @@ repoll:
break;
}
 
+   if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+   if ((*cur_qp)->mlx4_ib_qp_type &
+   (MLX4_IB_QPT_PROXY_SMI_OWNER |
+MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+   return use_tunnel_data(*cur_qp, cq, wc, tail, 
cqe);
+   }
+
wc->slid   = be16_to_cpu(cqe->rlid);
g_mlpath_rqpn  = be32_to_cpu(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xff;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index c136bb6..1248d57 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@

[PATCH for-next V2 15/22] IB/mlx4: Add iov directory in sysfs under the ib device

2012-08-03 Thread Jack Morgenstein
This directory is added only for the master -- slaves do
not have it.

The sysfs iov directory is used to manage and examine the port
pkey and guid paravirtualization.

Under iov/ports, the administrator may examine the gid and pkey
tables as they are present in the device (and as are seen in the
"network view" presented to the SM).

Under the iov/ directories, the admin may map
the index numbers in the physical tables (as under iov/ports)
to the paravirtualized index numbers that guests see.

Thus, for example, if the administrator, for port 1 on guest 2, say,
maps physical pkey index 10 to virtual index 1, that guest, whenever
it uses its pkey index 1 will actually be using the real pkey index 10.

Based on patch from Erez Shitrit 

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile |2 +-
 drivers/infiniband/hw/mlx4/alias_GUID.c |6 +-
 drivers/infiniband/hw/mlx4/mad.c|9 +
 drivers/infiniband/hw/mlx4/mcg.c|   67 +++
 drivers/infiniband/hw/mlx4/mlx4_ib.h|   43 ++
 drivers/infiniband/hw/mlx4/sysfs.c  |  794 +++
 6 files changed, 917 insertions(+), 4 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/sysfs.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index 31d4c8a..f4213b3 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o 
alias_GUID.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o 
alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c 
b/drivers/infiniband/hw/mlx4/alias_GUID.c
index 9db1581..c4252e7 100644
--- a/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -113,7 +113,7 @@ static __be64 get_cached_alias_guid(struct mlx4_ib_dev 
*dev, int port, int index
 }
 
 
-static ib_sa_comp_mask get_aguid_comp_mask_from_ix(int index)
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
 {
return IB_SA_COMP_MASK(4 + index);
 }
@@ -259,7 +259,7 @@ static void aliasguid_query_handler(int status,
/* Mark the record as not assigned, and let it
 * be sent again in the next work sched.*/
rec->status = MLX4_GUID_INFO_STATUS_IDLE;
-   rec->guid_indexes |= 
get_aguid_comp_mask_from_ix(i);
+   rec->guid_indexes |= 
mlx4_ib_get_aguid_comp_mask_from_ix(i);
}
} else {
   /* properly assigned record. */
@@ -337,7 +337,7 @@ static void invalidate_guid_record(struct mlx4_ib_dev *dev, 
u8 port, int index)
MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.
ports_guid[port - 1].all_rec_per_port[index].ownership)
continue;
-   comp_mask |= get_aguid_comp_mask_from_ix(i);
+   comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
}
dev->sriov.alias_guid.ports_guid[port - 1].
all_rec_per_port[index].guid_indexes = comp_mask;
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index ef9842d..ec64077 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1959,6 +1959,11 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
goto paravirt_err;
}
+   err = mlx4_ib_device_register_sysfs(dev);
+   if (err) {
+   mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+   goto sysfs_err;
+   }
 
mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 
clients\n",
 dev->dev->caps.sqp_demux);
@@ -1985,6 +1990,9 @@ demux_err:
mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
--i;
}
+   mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
mlx4_ib_destroy_alias_guid_service(dev);
 
 paravirt_err:
@@ -2015,5 +2023,6 @@ void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
 
mlx4_ib_cm_paravirt_clean(dev, -1);
mlx4_ib_destroy_alias_guid_service(dev);
+   mlx4_ib_device_unregister_sysfs(dev);
}
 }
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index 1ee2e3a..3c3b54c 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -110,6 +110,7 @@ struct mcast_group {
__be64  last_req_tid;
 
charname[33]; /* MGID string */
+   struct device_attribute dentry;
 
/* refco

[PATCH for-next V2 21/22] {NET,IB}/mlx4: Modify proxy/tunnel QP mechanism so that guests do no calculations

2012-08-03 Thread Jack Morgenstein
Previously, the structure of a guest's proxy QPs followed the structure of the 
PPF
special qps (qp0 port 1, qp0 port 2, qp1 port 1, qp1 port 2, ...).
The guest then did offset calculations on the sqp_base qp number that the PPF 
passed
to it in QUERY_FUNC_CAP().

This is now changed so that the guest does no offset calculations regarding 
proxy or
tunnel QPs to use.  This change frees the PPF from needing to adhere to a 
specific
order in allocating proxy and tunnel QPs.

Now, QUERY_FUNC_CAP provides each port individually with its proxy qp0, proxy 
qp1,
tunnel qp0, and tunnel qp1 QP numbers, and these are used directly where 
required
(with no offset calculations).

To accomplish this change, several fields were added to the phys_caps structure
for use by the PPF and by non-SRIOV mode:
base_sqpn -- in non-sriov mode, this was formerly sqp_start.
base_proxy_sqpn -- the first physical proxy qp number -- used by PPF
base_tunnel_sqpn -- the first physical tunnel qp number -- used by PPF.

The current code in the PPF still adheres to the previous layout of sqp's, 
proxy-sqp's,
and tunnel-sqp's.  However, the PPF can change this layout without affecting
VF or (paravirtualized) PF code.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |   12 +-
 drivers/infiniband/hw/mlx4/qp.c   |  104 +++--
 drivers/net/ethernet/mellanox/mlx4/fw.c   |  175 
 drivers/net/ethernet/mellanox/mlx4/fw.h   |   14 ++-
 drivers/net/ethernet/mellanox/mlx4/main.c |   61 --
 drivers/net/ethernet/mellanox/mlx4/qp.c   |   71 +
 include/linux/mlx4/device.h   |   16 ++-
 7 files changed, 292 insertions(+), 161 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index f7b9a41..31b0559 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -501,7 +501,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int 
slave, u8 port,
} else
tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
 
-   dqpn = dev->dev->caps.sqp_start + 8 * slave + port + (dest_qpt * 2) - 1;
+   dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + 
(dest_qpt * 2) - 1;
 
/* get tunnel tx data buf for slave */
src_qp = tun_qp->qp;
@@ -1070,9 +1070,9 @@ static int mlx4_ib_multiplex_sa_handler(struct ib_device 
*ibdev, int port,
 
 static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
 {
-   int slave_start = dev->dev->caps.sqp_start + 8 * slave;
+   int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
 
-   return (qpn >= slave_start && qpn <= slave_start + 1);
+   return (qpn >= proxy_start && qpn <= proxy_start + 1);
 }
 
 
@@ -1187,14 +1187,14 @@ static void mlx4_ib_multiplex_mad(struct 
mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
int slave;
 
/* Get slave that sent this packet */
-   if (wc->src_qp < dev->dev->caps.sqp_start ||
-   wc->src_qp >= dev->dev->caps.base_tunnel_sqpn ||
+   if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
+   wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * 
MLX4_MFUNC_MAX ||
(wc->src_qp & 0x1) != ctx->port - 1 ||
wc->src_qp & 0x4) {
mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", 
wc->src_qp);
return;
}
-   slave = ((wc->src_qp & ~0x7) - dev->dev->caps.sqp_start) / 8;
+   slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8;
if (slave != ctx->slave) {
mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
 "belongs to another slave\n", wc->src_qp);
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 3a3a690..24dcff8 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -116,33 +116,57 @@ static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct 
mlx4_ib_qp *qp)
if (!mlx4_is_master(dev->dev))
return 0;
 
-   return qp->mqp.qpn >= dev->dev->caps.base_sqpn &&
-  qp->mqp.qpn < dev->dev->caps.base_sqpn +
-  8 + 16 * MLX4_MFUNC_MAX;
+   return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
+  qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
+   8 * MLX4_MFUNC_MAX;
 }
 
 static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 {
-   return ((mlx4_is_master(dev->dev) &&
-qp->mqp.qpn >= dev->dev->caps.base_sqpn &&
-qp->mqp.qpn <= dev-

[PATCH for-next V2 17/22] net/mlx4_core: INIT/CLOSE port logic for IB ports in SRIOV mode

2012-08-03 Thread Jack Morgenstein
Normally, INIT_PORT and CLOSE_PORT are invoked when special QP0
transitions to RTR, or transitions to ERR/RESET respectively.

In SRIOV mode, however, the master is also paravirtualized.
This in turn requires that we not do INIT_PORT until the
entire QP0 path (real QP0 and proxy QP0) is ready to receive.
When the real QP0 goes down, we should indicate that
the port is not active.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/fw.c |   71 +--
 drivers/net/ethernet/mellanox/mlx4/qp.c |   38 +++--
 2 files changed, 92 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c 
b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 88d61ff..1d11e81 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -1365,6 +1365,19 @@ out:
return err;
 }
 
+/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0
+ * and real QP0 are active, so that the paravirtualized QP0 is ready
+ * to operate */
+static int check_qp0_state(struct mlx4_dev *dev, int function, int port)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   /* irrelevant if not infiniband */
+   if (priv->mfunc.master.qp0_state[port].proxy_qp0_active &&
+   priv->mfunc.master.qp0_state[port].qp0_active)
+   return 1;
+   return 0;
+}
+
 int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
   struct mlx4_vhcr *vhcr,
   struct mlx4_cmd_mailbox *inbox,
@@ -1381,14 +1394,29 @@ int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
return -ENODEV;
 
-   /* Enable port only if it was previously disabled */
-   if (!priv->mfunc.master.init_port_ref[port]) {
-   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
-  MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
-   if (err)
-   return err;
+   if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+   /* Enable port only if it was previously disabled */
+   if (!priv->mfunc.master.init_port_ref[port]) {
+   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+  MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+   if (err)
+   return err;
+   }
+   priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << 
port);
+   } else {
+   if (slave == mlx4_master_func_num(dev)) {
+   if (check_qp0_state(dev, slave, port) &&
+   !priv->mfunc.master.qp0_state[port].port_active) {
+   err = mlx4_cmd(dev, 0, port, 0, 
MLX4_CMD_INIT_PORT,
+  MLX4_CMD_TIME_CLASS_A, 
MLX4_CMD_NATIVE);
+   if (err)
+   return err;
+   priv->mfunc.master.qp0_state[port].port_active 
= 1;
+   
priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+   }
+   } else
+   priv->mfunc.master.slave_state[slave].init_port_mask |= 
(1 << port);
}
-   priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
++priv->mfunc.master.init_port_ref[port];
return 0;
 }
@@ -1463,13 +1491,30 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
 
if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
return -ENODEV;
-   if (priv->mfunc.master.init_port_ref[port] == 1) {
-   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000,
-  MLX4_CMD_NATIVE);
-   if (err)
-   return err;
+
+   if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+   if (priv->mfunc.master.init_port_ref[port] == 1) {
+   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+  1000, MLX4_CMD_NATIVE);
+   if (err)
+   return err;
+   }
+   priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << 
port);
+   } else {
+   /* infiniband port */
+   if (slave == mlx4_master_func_num(dev)) {
+   if (!priv->mfunc.master.qp0_state[port].qp0_active &&
+   priv->mfunc.master.qp0_state[port].port_active) {
+   err = mlx4_cmd(dev, 0, port, 0, 
MLX4_CMD_CLOSE_PORT,
+  1000, MLX4_CMD_NATIVE);
+   

[PATCH for-next V2 22/22] IB/mlx4: Create pv contexts for active VFs when PF (master) ib driver initializes

2012-08-03 Thread Jack Morgenstein
When have VFs and PFs on same host, the VFs are activated within the
mlx4_core module before the mlx4_ib kernel module is loaded.

When the mlx4_ib module initializes the PF (master), it now creates
MAD paravirtualization contexts for any VFs already active.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |3 +++
 drivers/infiniband/hw/mlx4/main.c |   11 +++
 2 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 31b0559..090a2ad 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1648,6 +1648,9 @@ static int create_pv_resources(struct ib_device *ibdev, 
int slave, int port,
 {
int ret, cq_size;
 
+   if (ctx->state != DEMUX_PV_STATE_DOWN)
+   return -EEXIST;
+
ctx->state = DEMUX_PV_STATE_STARTING;
/* have QP0 only on port owner, and only if link layer is IB */
if (ctx->slave == mlx4_master_func_num(to_mdev(ctx->ib_dev)->dev) &&
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 00384b3..82b7231 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -74,6 +74,8 @@ struct update_gid_work {
int port;
 };
 
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+
 static struct workqueue_struct *wq;
 
 static void init_query_mad(struct ib_smp *mad)
@@ -1469,6 +1471,15 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
if (mlx4_is_mfunc(ibdev->dev))
init_pkeys(ibdev);
 
+   /* create paravirt contexts for any VFs which are active */
+   if (mlx4_is_master(ibdev->dev)) {
+   for (j = 0; j < MLX4_MFUNC_MAX; j++) {
+   if (j == mlx4_master_func_num(ibdev->dev))
+   continue;
+   if (mlx4_is_slave_active(ibdev->dev, j))
+   do_slave_init(ibdev, j, 1);
+   }
+   }
return ibdev;
 
 err_notif:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 19/22] {NET,IB}/mlx4: Activate SRIOV mode for IB

2012-08-03 Thread Jack Morgenstein
Remove the error returns for IB ports from mlx4_ib_add,
mlx4_INIT_PORT_wrapper, and mlx4_CLOSE_PORT_wrapper.

Currently, SRIOV is supported only for devices for which the
link-layer is IB on all ports; RoCE support will be implemented at a later time.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/main.c   |8 ++--
 drivers/net/ethernet/mellanox/mlx4/fw.c |6 --
 include/linux/mlx4/device.h |4 
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 1166379..00384b3 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1292,11 +1292,15 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 
pr_info_once("%s", mlx4_ib_version);
 
-   if (mlx4_is_mfunc(dev)) {
-   pr_warn("IB not yet supported in SRIOV\n");
+   mlx4_foreach_non_ib_transport_port(i, dev)
+   num_ports++;
+
+   if (mlx4_is_mfunc(dev) && num_ports) {
+   dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as 
yet\n");
return NULL;
}
 
+   num_ports = 0;
mlx4_foreach_ib_transport_port(i, dev)
num_ports++;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c 
b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 1d11e81..e36cbdb 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -1391,9 +1391,6 @@ int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))
return 0;
 
-   if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
-   return -ENODEV;
-
if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
/* Enable port only if it was previously disabled */
if (!priv->mfunc.master.init_port_ref[port]) {
@@ -1489,9 +1486,6 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
(1 << port)))
return 0;
 
-   if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
-   return -ENODEV;
-
if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
if (priv->mfunc.master.init_port_ref[port] == 1) {
err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 863fcea..f8c9316 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -697,6 +697,10 @@ struct mlx4_init_port_param {
for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
if ((type) == (dev)->caps.port_mask[(port)])
 
+#define mlx4_foreach_non_ib_transport_port(port, dev) \
+   for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)   \
+   if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
+
 #define mlx4_foreach_ib_transport_port(port, dev) \
for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)   \
if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 20/22] {NET,IB}/mlx4: Paravirtualize Node Guids for slaves

2012-08-03 Thread Jack Morgenstein
This is necessary in order to support more than one VF per PF in a VM for
software that uses the node GUID as a discriminator, such as librdmacm.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |   14 ++
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |3 +++
 drivers/net/ethernet/mellanox/mlx4/cmd.c  |   11 +++
 drivers/net/ethernet/mellanox/mlx4/main.c |   22 ++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |1 +
 include/linux/mlx4/device.h   |2 ++
 6 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index cee7bf6..f7b9a41 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -89,6 +90,12 @@ static void handle_lid_change_event(struct mlx4_ib_dev *dev, 
u8 port_num);
 static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
int block, u32 change_bitmap);
 
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI   ((u64) (((u64)IB_OPENIB_OUI) << 40))
+   return (cpu_to_be64( NODE_GUID_HI | random32()));
+}
+
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
 {
return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
@@ -1958,6 +1965,13 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
return 0;
}
 
+   for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+   if (i == mlx4_master_func_num(dev->dev))
+   mlx4_put_slave_node_guid(dev->dev, i, 
dev->ib_dev.node_guid);
+   else
+   mlx4_put_slave_node_guid(dev->dev, i, 
mlx4_ib_gen_node_guid());
+   }
+
err = mlx4_ib_init_alias_guid_service(dev);
if (err) {
mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e57a220..e04cbc9 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -723,4 +723,7 @@ int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev 
*device) ;
 
 void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
 
+__be64 mlx4_ib_gen_node_guid(void);
+
+
 #endif /* MLX4_IB_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 9717002..33df1db 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -651,6 +651,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
struct ib_smp *outsmp = outbox->buf;
__be16 *outtab = (__be16 *)(outsmp->data);
__be32 slave_cap_mask;
+   __be64 slave_node_guid;
port = vhcr->in_modifier;
 
if (smp->base_version == 1 &&
@@ -710,6 +711,16 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
}
return err;
}
+   if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) {
+   err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+vhcr->in_modifier, 
vhcr->op_modifier,
+vhcr->op, MLX4_CMD_TIME_CLASS_C, 
MLX4_CMD_NATIVE);
+   if (!err) {
+   slave_node_guid =  
mlx4_get_slave_node_guid(dev, slave);
+   memcpy(outsmp->data + 12, 
&slave_node_guid, 8);
+   }
+   return err;
+   }
}
}
if (slave != mlx4_master_func_num(dev) &&
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index cd82607..d5e4238 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -438,6 +438,28 @@ void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, 
int port, int i, int
 }
 EXPORT_SYMBOL(mlx4_sync_pkey_table);
 
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid)
+{
+   struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+   if (!mlx4_is_master(dev))
+   return;
+
+   priv->slave_node_guids[slave] = guid;
+}
+EXPORT_SYMBOL(mlx4_put_slave_node_guid);
+
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave)
+{
+   struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+   if (!mlx4_is_master(dev))
+   return 0;
+
+   return priv->slave_node_guids[slave];
+}
+EXPORT_SYMBOL(mlx4_get_slave_node_guid);
+
 int mlx4_is_slave_active(struc

[PATCH for-next V2 16/22] net/mlx4_core: Adjustments to SET_PORT for SRIOV-IB

2012-08-03 Thread Jack Morgenstein
1. Slave may not set the IS_SM capability for the port.
2. No DEV_MGR in multifunc mode.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/port.c |   10 ++
 include/linux/mlx4/device.h   |5 +
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c 
b/drivers/net/ethernet/mellanox/mlx4/port.c
index e36dd0f..8ead556 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -732,6 +732,16 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int 
slave, u32 in_mod,
new_cap_mask = ((__be32 *) inbox->buf)[1];
}
 
+   /* slave may not set the IS_SM capability for the port */
+   if (slave != mlx4_master_func_num(dev) &&
+   (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM))
+   return -EINVAL;
+
+   /* NO DEV_MGR in multifunc mode */
+   if (mlx4_is_mfunc(dev) &&
+   (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP))
+   return -EINVAL;
+
agg_cap_mask = 0;
slave_cap_mask =
priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 8fed1fc..863fcea 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -54,6 +54,11 @@ enum {
 };
 
 enum {
+   MLX4_PORT_CAP_IS_SM = 1 << 1,
+   MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19,
+};
+
+enum {
MLX4_MAX_PORTS  = 2,
MLX4_MAX_PORT_PKEYS = 128
 };
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 18/22] IB/mlx4: Miscellaneous adjustments to SRIOV IB support

2012-08-03 Thread Jack Morgenstein
1. allow only master to change node description
2. prevent ah leakage in send mads
3. take device part number from PCI structure, so that
   guests see the VF part number (and not the PF part number)
4. place the device revision ID into caps structure at startup
5. SET_PORT in update_gids_task needs to go through wrapper on master.
6. in mlx4_ib_event, PORT_MGMT_EVENT needs be handled in a work queue
   on the master, since it propagates events to slaves using GEN_EQE
7. Do not support FMR on slaves.
8. Add spinlock to slave_event(), since it is called both in interrupt
   context and in process context (due to 6 above, and also if smp_snoop
   is used).
   This fix was found and implemented by Saeed Mahameed 

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |6 +-
 drivers/infiniband/hw/mlx4/main.c |   26 ++
 drivers/net/ethernet/mellanox/mlx4/cmd.c  |1 +
 drivers/net/ethernet/mellanox/mlx4/eq.c   |8 ++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |1 +
 5 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index ec64077..cee7bf6 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -705,7 +705,9 @@ static int ib_process_mad(struct ib_device *ibdev, int 
mad_flags, u8 port_num,
if (!out_mad->mad_hdr.status) {
if (!(to_mdev(ibdev)->dev->caps.flags & 
MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
smp_snoop(ibdev, port_num, in_mad, prev_lid);
-   node_desc_override(ibdev, out_mad);
+   /* slaves get node desc from FW */
+   if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+   node_desc_override(ibdev, out_mad);
}
 
/* set return bit in status of directed route responses */
@@ -788,6 +790,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int 
mad_flags, u8 port_num,
 static void send_handler(struct ib_mad_agent *agent,
 struct ib_mad_send_wc *mad_send_wc)
 {
+   if (mad_send_wc->send_buf->context[0])
+   ib_destroy_ah(mad_send_wc->send_buf->context[0]);
ib_free_send_mad(mad_send_wc->send_buf);
 }
 
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 778fdb8..1166379 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -138,7 +138,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 
props->vendor_id   = be32_to_cpup((__be32 *) (out_mad->data + 
36)) &
0xff;
-   props->vendor_part_id  = be16_to_cpup((__be16 *) (out_mad->data + 
30));
+   props->vendor_part_id  = dev->dev->pdev->device;
props->hw_ver  = be32_to_cpup((__be32 *) (out_mad->data + 
32));
memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
 
@@ -477,6 +477,9 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, 
int mask,
if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
return 0;
 
+   if (mlx4_is_slave(to_mdev(ibdev)->dev))
+   return -EOPNOTSUPP;
+
spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags);
memcpy(ibdev->node_desc, props->node_desc, 64);
spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags);
@@ -492,7 +495,7 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, 
int mask,
memset(mailbox->buf, 0, 256);
memcpy(mailbox->buf, props->node_desc, 64);
mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
-MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 
mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
 
@@ -920,6 +923,7 @@ static int init_node_data(struct mlx4_ib_dev *dev)
if (err)
goto out;
 
+   dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
 
 out:
@@ -1008,7 +1012,7 @@ static void update_gids_task(struct work_struct *work)
 
err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 
gw->port,
   1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
-  MLX4_CMD_NATIVE);
+  MLX4_CMD_WRAPPED);
if (err)
pr_warn("set port command failed\n");
else {
@@ -1399,10 +1403,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.detach_mcast  = mlx4_ib_mcg_detach;
ibdev->ib_dev.process_mad   = mlx4_ib_process_mad;
 
-   ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
-   ibdev->ib_dev.map_phys_fmr  = mlx4_

[PATCH for-next V2 10/22] IB/mlx4: Added Multicast Groups (MCG) para-virtualization for SRIOV

2012-08-03 Thread Jack Morgenstein
From: Oren Duer 

MCG para-virtualization support includes:
- Creating multicast groups by VFs, and keeping accounting of them
- Leaving multicast groups by VFs
- SM will only be updated with real changes in the overall picture of MCGs 
status
- Creation of MGID=0 groups (let SM choose MGID)

Note that the MCG module maintains its own internal MCG object reference
counts. The reason for this is that the IB core is used to track only
the multicast groups joins generated by the PF it runs over.
The PF ib-core layer is unaware of slaves, so it cannot be used to keep
track of MCG joins they generate.

Signed-off-by: Oren Duer 
Signed-off-by: Eli Cohen 
Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile  |2 +-
 drivers/infiniband/hw/mlx4/mad.c |   60 ++-
 drivers/infiniband/hw/mlx4/main.c|   18 +-
 drivers/infiniband/hw/mlx4/mcg.c | 1187 ++
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   30 +
 5 files changed, 1285 insertions(+), 12 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/mcg.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index 70f09c7..20d627d 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 4c8650f..195b211 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -75,6 +75,14 @@ struct mlx4_rcv_tunnel_mad {
struct ib_mad mad;
 } __packed;
 
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+   return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+   cpu_to_be64(0xff00LL);
+}
+
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 void *in_mad, void *response_mad)
@@ -208,8 +216,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, 
struct ib_mad *mad,
 pinfo->neighbormtu_mastersmsl & 0xf);
 
if (pinfo->clientrereg_resv_subnetto & 0x80)
-   mlx4_ib_dispatch_event(dev, port_num,
-  
IB_EVENT_CLIENT_REREGISTER);
+   handle_client_rereg_event(dev, port_num);
 
if (prev_lid != lid)
mlx4_ib_dispatch_event(dev, port_num,
@@ -304,7 +311,17 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 
port_num, struct ib_mad *ma
 static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int 
slave,
 struct ib_sa_mad 
*sa_mad)
 {
-   return 0;
+   int ret = 0;
+
+   /* dispatch to different sa handlers */
+   switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+   case IB_SA_ATTR_MC_MEMBER_REC:
+   ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+   break;
+   default:
+   break;
+   }
+   return ret;
 }
 
 int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
@@ -764,6 +781,16 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
}
 }
 
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+   /* re-configure the mcg's */
+   if (mlx4_is_master(dev->dev)) {
+   if (!dev->sriov.is_going_down)
+   mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 
1], 0);
+   }
+   mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
 void handle_port_mgmt_change_event(struct work_struct *work)
 {
struct ib_event_work *ew = container_of(work, struct ib_event_work, 
work);
@@ -793,8 +820,7 @@ void handle_port_mgmt_change_event(struct work_struct *work)
mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
 
if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
-   mlx4_ib_dispatch_event(dev, port,
-  IB_EVENT_CLIENT_REREGISTER);
+   handle_client_rereg_event(dev, port);
break;
 
case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
@@ -864,7 +890,17 @@ static int mlx4_ib_post_pv_qp_buf(struct 
mlx4_ib_demux_pv_ctx *ctx,
 static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
int slave, struct ib_sa_mad *sa_mad)
 {
-   return 0;
+   int ret = 0;
+
+   /* dispatch to different sa handlers */
+

[PATCH for-next V2 11/22] IB/mlx4: Add CM paravirtualization

2012-08-03 Thread Jack Morgenstein
From: Amir Vadai 

In CM para-virtualization:
1. Incoming requests are steered to the correct vHCA according to the embedded 
GID
2. Communication IDs on outgoing requests are replaced by a globally unique ID, 
generated
   by the PPF, since there is no synchronization of ID generation between 
guests (and so these
   IDs are not guaranteed to be globally unique).
   The guest's comm ID is stored, and is returned to the response MAD when it 
arrives.

Signed-off-by: Amir Vadai 
Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile  |2 +-
 drivers/infiniband/hw/mlx4/cm.c  |  437 ++
 drivers/infiniband/hw/mlx4/mad.c |   16 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   15 ++
 4 files changed, 468 insertions(+), 2 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/cm.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index 20d627d..bf0aa90 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
new file mode 100644
index 000..0a52146
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT  (5 * HZ)
+
+struct id_map_entry {
+   struct rb_node node;
+
+   u32 sl_cm_id;
+   u32 pv_cm_id;
+   int slave_id;
+   int scheduled_delete;
+   struct mlx4_ib_dev *dev;
+
+   struct list_head list;
+   struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+   struct ib_mad_hdr hdr;
+
+   __be32 local_comm_id;
+   __be32 remote_comm_id;
+};
+
+struct cm_req_msg {
+   unsigned char unused[0x60];
+   union ib_gid primary_path_sgid;
+};
+
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+   msg->local_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+   return be32_to_cpu(msg->local_comm_id);
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+   msg->remote_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+   return be32_to_cpu(msg->remote_comm_id);
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad 
*mad)
+{
+   struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+   return msg->primary_path_sgid;
+}
+
+/* Lock should be taken before called */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+   struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+   struct rb_node *node = sl_id_map->rb_node;
+
+   while (node) {
+   struct id_map_entry *id_map_entry =
+   rb_entry(node, struct id_map_entry, node);
+
+   if (id_map_entry->sl_cm_id > sl_cm_id)
+ 

[PATCH for-next V2 14/22] IB/mlx4: Propagate pkey and guid change port management events to slaves

2012-08-03 Thread Jack Morgenstein
pkey change and guid change events are not of interest to all slaves,
but only to those slaves which "see" the table slots whose contents
have changed.

For example, if the guid at port 1, index 5 has changed in the
PPF, we wish to propagate the gid-change event only to the function
which has that guid index mapped to its port/guid table (in this case
it is slave #5). Other functions should not get the event,
since the event does not affect them.

Similarly with pkeys -- pkey change events are forwarded
only to slaves which have that pkey index mapped to their
virtual pkey table.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c|  162 +--
 drivers/net/ethernet/mellanox/mlx4/fw.c |6 +
 2 files changed, 161 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 0db3e9c..ef9842d 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -54,6 +54,15 @@ enum {
 #define MLX4_TUN_IS_RECV(a)  (((a) >>  MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
 #define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
 
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) 
be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) 
be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
 struct mlx4_mad_rcv_buf {
struct ib_grh grh;
u8 payload[256];
@@ -76,6 +85,9 @@ struct mlx4_rcv_tunnel_mad {
 } __packed;
 
 static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+   int block, u32 change_bitmap);
 
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
 {
@@ -219,8 +231,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, 
struct ib_mad *mad,
handle_client_rereg_event(dev, port_num);
 
if (prev_lid != lid)
-   mlx4_ib_dispatch_event(dev, port_num,
-  IB_EVENT_LID_CHANGE);
+   handle_lid_change_event(dev, port_num);
break;
 
case IB_SMP_ATTR_PKEY_TABLE:
@@ -230,6 +241,9 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, 
struct ib_mad *mad,
break;
}
 
+   /* at this point, we are running in the master.
+* Slaves do not receive SMPs.
+*/
bn  = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 
0x;
base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
pkey_change_bitmap = 0;
@@ -247,10 +261,13 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
 "block=0x%x, change_bitmap=0x%x\n",
 port_num, bn, pkey_change_bitmap);
 
-   if (pkey_change_bitmap)
+   if (pkey_change_bitmap) {
mlx4_ib_dispatch_event(dev, port_num,
   IB_EVENT_PKEY_CHANGE);
-
+   if (!dev->sriov.is_going_down)
+   __propagate_pkey_ev(dev, port_num, bn,
+   pkey_change_bitmap);
+   }
break;
 
case IB_SMP_ATTR_GUID_INFO:
@@ -258,12 +275,56 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
if (!mlx4_is_master(dev->dev))
mlx4_ib_dispatch_event(dev, port_num,
   IB_EVENT_GID_CHANGE);
+   /*if master, notify relevant slaves*/
+   if (mlx4_is_master(dev->dev) &&
+   !dev->sriov.is_going_down) {
+   bn = be32_to_cpu(((struct ib_smp 
*)mad)->attr_mod);
+   mlx4_ib_update_cache_on_guid_change(dev, bn, 
port_num,
+   (u8 
*)(&((struct ib_smp *)mad)->data));
+   mlx4_ib_notify_slaves_on_guid_change(dev, bn, 
port_num,
+   

[PATCH for-next V2 08/22] IB/mlx4: SRIOV multiplex and demultiplex MADs

2012-08-03 Thread Jack Morgenstein
Special QPs are para-virtualized.

vHCAs are not given direct access to QP0/1. Rather, these QPs are operated by a
special context hosted by the PF, which mediates access to/from vHCAs.
This is done by opening a “tunnel” per vHCA port per QP0/1. A tunnel comprises
a pair of UD QPs:  a “Tunnel QP” in the PF-context and a “Proxy QP” in the vHCA.
All vHCA MAD traffic must pass through the corresponding tunnel.
vHCA QPs cannot be assigned to VL15 and are denied of the well-known QKey.

Outgoing messages are "de-multiplexed" (i.e., directed to the wire via
the real special QP).

Incoming messages are "multiplexed" (i.e. steered by the PPF to the correct
VF or to the PF)

QP0 access is restricted to the PF vHCA. VF vHCAs also have (virtual) QP0’s,
but they never receive any SMPs and all SMPs sent are discarded.
QP1 traffic is allowed for all vHCAs, but special care is required to bridge
the gap between the host and network views.

Specifically:
- Transaction IDs are mapped to guarantee uniqueness among vHCAs
- CM para-virtualization
  o   Incoming requests are steered to the correct vHCA according to the 
embedded GID
  o   Local communication IDs are mapped to ensure uniqueness among vHCAs
  (see the patch that adds CM paravirtualization.)
- Multicast para-virtualization
  o   The PF context aggregates membership state from all vHCAs
  o   The SA is contacted only when the aggregate membership changes
  o   If the aggregate does not change, the PF context will provide the
   requesting vHCA with the proper response
  (see the patch that adds multicast group paravirtualization.)

Incoming MADs are steered according to:
- the DGID If a GRH is present
- the mapped transaction ID for response MADs
- the embedded GID in CM requests
- the remote communication ID in other CM messages

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c |  567 +-
 1 files changed, 565 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 2413a08..a392a5c 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -32,6 +32,8 @@
 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -296,6 +298,254 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 
port_num, struct ib_mad *ma
}
 }
 
+static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int 
slave,
+struct ib_sa_mad 
*sa_mad)
+{
+   return 0;
+}
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
+{
+   struct mlx4_ib_dev *dev = to_mdev(ibdev);
+   int i;
+
+   for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+   if (dev->sriov.demux[port - 1].guid_cache[i] == guid)
+   return i;
+   }
+   return -1;
+}
+
+
+static int get_pkey_phys_indices(struct mlx4_ib_dev *ibdev, u8 port, u8 
ph_pkey_ix,
+u8 *full_pk_ix, u8 *partial_pk_ix,
+int *is_full_member)
+{
+   u16 search_pkey;
+   int fm;
+   int err = 0;
+   u16 pk;
+
+   err = ib_get_cached_pkey(&ibdev->ib_dev, port, ph_pkey_ix, 
&search_pkey);
+   if (err)
+   return err;
+
+   fm = (search_pkey & 0x8000) ? 1 : 0;
+   if (fm) {
+   *full_pk_ix = ph_pkey_ix;
+   search_pkey &= 0x7FFF;
+   } else {
+   *partial_pk_ix = ph_pkey_ix;
+   search_pkey |= 0x8000;
+   }
+
+   if (ib_find_exact_cached_pkey(&ibdev->ib_dev, port, search_pkey, &pk))
+   pk = 0x;
+
+   if (fm)
+   *partial_pk_ix = (pk & 0xFF);
+   else
+   *full_pk_ix = (pk & 0xFF);
+
+   *is_full_member = fm;
+   return err;
+}
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad)
+{
+   struct ib_sge list;
+   struct ib_send_wr wr, *bad_wr;
+   struct mlx4_ib_demux_pv_ctx *tun_ctx;
+   struct mlx4_ib_demux_pv_qp *tun_qp;
+   struct mlx4_rcv_tunnel_mad *tun_mad;
+   struct ib_ah_attr attr;
+   struct ib_ah *ah;
+   struct ib_qp *src_qp = NULL;
+   unsigned tun_tx_ix = 0;
+   int dqpn;
+   int ret = 0;
+   int i;
+   int is_full_member = 0;
+   u16 tun_pkey_ix;
+   u8 ph_pkey_ix, full_pk_ix = 0, partial_pk_ix = 0;
+
+   if (dest_qpt > IB_QPT_GSI)
+   return -EINVAL;
+
+   tun_ctx = dev->sriov.demux[port-1].tun[slave];
+
+   /* check if proxy qp created */
+   if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
+   return -EAGAIN;
+
+   /* QP0 forwarding only for Dom0 */
+   if (!dest_q

[PATCH for-next V2 13/22] {NET,IB}/mlx4: Add alias_guid mechanism

2012-08-03 Thread Jack Morgenstein
For IB ports, we paravirtualize the GUID at index 0 on slaves.
The GUID at index 0 seen by a slave is the actual GUID occupying
the GUID table at the slave-id index.

The driver, by default, requests at startup time that the subnet manager
populate its entire guid table with GUIDs. These guids are then
mapped (paravirtualized) to the slaves, and appear for each slave
as its GUID at index 0.

Until each slave has such a guid, its port status is DOWN.

The guid table is cached to support SQP paravirtualization, and event
propagation to slaves on guid change (we test to see if the guid really
changed before propagating an event to the slave).

To support this caching, added capability to __mlx4_ib_query_gid()
to obtain the network view (i.e., physical view) gid at index X,
not just the host (paravirtualized) view.

Based on a patch from Erez Shitrit 

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile  |2 +-
 drivers/infiniband/hw/mlx4/alias_GUID.c  |  688 ++
 drivers/infiniband/hw/mlx4/mad.c |   19 +-
 drivers/infiniband/hw/mlx4/main.c|   37 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   74 
 drivers/net/ethernet/mellanox/mlx4/cmd.c |6 +-
 6 files changed, 816 insertions(+), 10 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/alias_GUID.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index bf0aa90..31d4c8a 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o 
alias_GUID.o
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c 
b/drivers/infiniband/hw/mlx4/alias_GUID.c
new file mode 100644
index 000..9db1581
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+ /***/
+/*This file support the handling of the Alias GUID feature. */
+/***/
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "mlx4_ib.h"
+
+/*
+The driver keeps the current state of all guids, as they are in the HW.
+Whenever we receive an smp mad GUIDInfo record, the data will be cached.
+*/
+
+struct mlx4_alias_guid_work_context {
+   u8 port;
+   struct mlx4_ib_dev *dev ;
+   struct ib_sa_query *sa_query;
+   struct completion   done;
+   int query_id;
+   struct list_headlist;
+   int block_num;
+};
+
+struct mlx4_next_alias_guid_work {
+   u8 port;
+   u8 block_num;
+   struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int 
block_num,
+u8 port_num, u8 *p_data)
+{
+   int i;
+   u64 guid_indexes;
+   int slave_id;
+   int port_index = port_num - 1;
+
+   if (!mlx4_is_master(dev->dev))
+   return;
+
+   guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+  ports_guid[port_num - 1].
+  all_rec_per_port[block_num].guid_indexes);
+   pr_debug("p

[PATCH for-next V2 12/22] net/mlx4_core: Add IB port-state machine, and port mgmt event propagation infrastructure

2012-08-03 Thread Jack Morgenstein
For an IB port, a slave should not show port active until that
slave has a valid alias-guid (provided by the subnet manager).
Therefore the port-up event should be passed to a slave only after
both the port is up, and the slave's alias-guid has been set.

Also, provide the infrastructure for propagating port-management
events (client-reregister, etc) to slaves.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/eq.c   |  237 +++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |1 +
 include/linux/mlx4/device.h   |   28 
 3 files changed, 250 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c 
b/drivers/net/ethernet/mellanox/mlx4/eq.c
index 99a0464..c425826 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -200,6 +200,196 @@ static void mlx4_slave_event(struct mlx4_dev *dev, int 
slave,
slave_event(dev, slave, eqe);
 }
 
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+   struct mlx4_eqe eqe;
+
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   struct mlx4_slave_state *s_slave = 
&priv->mfunc.master.slave_state[slave];
+
+   if (!s_slave->active)
+   return 0;
+
+   memset(&eqe, 0, sizeof eqe);
+
+   eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+   eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE;
+   eqe.event.port_mgmt_change.port = port;
+
+   return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_pkey_eqe);
+
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+   struct mlx4_eqe eqe;
+
+   /*don't send if we don't have the that slave */
+   if (dev->num_vfs < slave)
+   return 0;
+   memset(&eqe, 0, sizeof eqe);
+
+   eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+   eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO;
+   eqe.event.port_mgmt_change.port = port;
+
+   return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_guid_change_eqe);
+
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port,
+  u8 port_subtype_change)
+{
+   struct mlx4_eqe eqe;
+
+   /*don't send if we don't have the that slave */
+   if (dev->num_vfs < slave)
+   return 0;
+   memset(&eqe, 0, sizeof eqe);
+
+   eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE;
+   eqe.subtype = port_subtype_change;
+   eqe.event.port_change.port = cpu_to_be32(port << 28);
+
+   mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__,
+port_subtype_change, slave, port);
+   return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe);
+
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int 
slave, u8 port)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+   if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS) {
+   pr_err("%s: Error: asking for slave:%d, port:%d\n",
+  __func__, slave, port);
+   return SLAVE_PORT_DOWN;
+   }
+   return s_state[slave].port_state[port];
+}
+EXPORT_SYMBOL(mlx4_get_slave_port_state);
+
+static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port,
+enum slave_port_state state)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+
+   if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) {
+   pr_err("%s: Error: asking for slave:%d, port:%d\n",
+  __func__, slave, port);
+   return -1;
+   }
+   s_state[slave].port_state[port] = state;
+
+   return 0;
+}
+
+static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event)
+{
+   int i;
+   enum slave_port_gen_event gen_event;
+
+   for (i = 0; i < dev->num_slaves; i++)
+   set_and_calc_slave_port_state(dev, i, port, event, &gen_event);
+}
+/**
+   The function get as input the new event to that port,
+   and according to the prev state change the slave's port state.
+   The events are:
+   MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+   MLX4_PORT_STATE_DEV_EVENT_PORT_UP
+   MLX4_PORT_STATE_IB_EVENT_GID_VALID
+   MLX4_PORT_STATE_IB_EVENT_GID_INVALID
+***/
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave,
+ u8 port, int event,
+

[PATCH for-next V2 07/22] {NET,IB}/mlx4: Implement QP paravirtualization and maintain phys_pkey_cache for smp_snoop

2012-08-03 Thread Jack Morgenstein
This requires:
1. Replacing the paravirtualized pkey index (inserted
   by the guest) with the real pkey index

2. For UD qp's, placing the guest's true source gid index
   in the address path structure mgid field, and setting
   the ud_force_mgid bit so that the mgid is taken from
   the qp context and not from the WQE when posting sends.

3. For UC and RC QPs, placing the guest's true source gid index
   in the address path structure mgid field.

4. For tunnel and proxy QPs, setting the QKEY value reserved for
   that proxy/tunnel pair.

Since not all the above adjustments occur in all the QP transitions,
the QP transitions require separate wrapper functions.

Secondly, initialize the pkey virtualization table to its default
values: Master virtualized table is 1-1 with the real pkey table,
guest virtualized table has pkey index 0 mapped to the real
pkey index 0, and all the other pkey indices mapped to the
reserved (invalid) pkey at index 127.

Finally, add logic in smp_snoop for maintaining the phys_pkey_cache,
and generating events on the master only if a pkey actually changed.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c   |   33 +++-
 drivers/infiniband/hw/mlx4/main.c  |   35 +++
 drivers/net/ethernet/mellanox/mlx4/cmd.c   |   12 +-
 drivers/net/ethernet/mellanox/mlx4/main.c  |   11 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h  |   47 +
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  217 +++-
 include/linux/mlx4/device.h|3 +
 7 files changed, 344 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index d955cc4..2413a08 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -184,6 +184,10 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
 {
struct ib_port_info *pinfo;
u16 lid;
+   __be16 *base;
+   u32 bn, pkey_change_bitmap;
+   int i;
+
 
struct mlx4_ib_dev *dev = to_mdev(ibdev);
if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
@@ -208,8 +212,33 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
break;
 
case IB_SMP_ATTR_PKEY_TABLE:
-   mlx4_ib_dispatch_event(dev, port_num,
-  IB_EVENT_PKEY_CHANGE);
+   if (!mlx4_is_mfunc(dev->dev)) {
+   mlx4_ib_dispatch_event(dev, port_num,
+  IB_EVENT_PKEY_CHANGE);
+   break;
+   }
+
+   bn  = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 
0x;
+   base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
+   pkey_change_bitmap = 0;
+   for (i = 0; i < 32; i++) {
+   pr_debug("PKEY[%d] = x%x\n",
+i + bn*32, be16_to_cpu(base[i]));
+   if (be16_to_cpu(base[i]) !=
+   dev->pkeys.phys_pkey_cache[port_num - 1][i 
+ bn*32]) {
+   pkey_change_bitmap |= (1 << i);
+   dev->pkeys.phys_pkey_cache[port_num - 
1][i + bn*32] =
+   be16_to_cpu(base[i]);
+   }
+   }
+   pr_debug("PKEY Change event: port=%d, "
+"block=0x%x, change_bitmap=0x%x\n",
+port_num, bn, pkey_change_bitmap);
+
+   if (pkey_change_bitmap)
+   mlx4_ib_dispatch_event(dev, port_num,
+  IB_EVENT_PKEY_CHANGE);
+
break;
 
case IB_SMP_ATTR_GUID_INFO:
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 075561f..bdfd3ae 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1120,6 +1120,38 @@ static int mlx4_ib_netdev_event(struct notifier_block 
*this, unsigned long event
return NOTIFY_DONE;
 }
 
+static void init_pkeys(struct mlx4_ib_dev *ibdev)
+{
+   int port;
+   int slave;
+   int i;
+
+   if (mlx4_is_master(ibdev->dev)) {
+   for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) {
+   for (port = 1; port <= ibdev->dev->caps.num_ports; 
++port) {
+   for (i = 0;
+i < 
ibdev->dev->phys_caps.pkey_phys_table_len[port];
+  

[PATCH for-next V2 06/22] IB/mlx4: Initialize SRIOV IB support for slaves in master context

2012-08-03 Thread Jack Morgenstein
Allocate sriov paravirtualization resources and mad demuxing contexts
on the master.

This has two parts.  The first part is to initialize the structures
to contain the contexts.  This is done at master startup time
(mlx4_ib_init_sriov).

The second part is to actually create the tunneling resources required
on the master to support a slave.  This is performed when the master detects
that a slave has started up (MLX4_DEV_EVENT_SLAVE_INIT event generated
when a slave initializes its comm channel).

For the master, there is no such startup event, so it creates its own
tunneling resources when it starts up.  In addition, the master also
creates the real special QPs. (The ib_core layer on the master causes
creation of proxy special qp's, since the master is also
paravirtualized at the ib_core layer).

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c |  684 ++
 drivers/infiniband/hw/mlx4/main.c|   80 -
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   34 ++
 drivers/net/ethernet/mellanox/mlx4/cmd.c |3 +
 include/linux/mlx4/device.h  |3 +-
 include/linux/mlx4/driver.h  |2 +
 6 files changed, 798 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index c27141f..d955cc4 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -44,6 +44,35 @@ enum {
MLX4_IB_VENDOR_CLASS2 = 0xa
 };
 
+#define MLX4_TUN_SEND_WRID_SHIFT 34
+#define MLX4_TUN_QPN_SHIFT 32
+#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT)
+#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT)
+
+#define MLX4_TUN_IS_RECV(a)  (((a) >>  MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
+#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
+
+struct mlx4_mad_rcv_buf {
+   struct ib_grh grh;
+   u8 payload[256];
+} __packed;
+
+struct mlx4_mad_snd_buf {
+   u8 payload[256];
+} __packed;
+
+struct mlx4_tunnel_mad {
+   struct ib_grh grh;
+   struct mlx4_ib_tunnel_header hdr;
+   struct ib_mad mad;
+} __packed;
+
+struct mlx4_rcv_tunnel_mad {
+   struct mlx4_rcv_tunnel_hdr hdr;
+   struct ib_grh grh;
+   struct ib_mad mad;
+} __packed;
+
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 void *in_mad, void *response_mad)
@@ -512,3 +541,658 @@ void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 
port_num,
 
ib_dispatch_event(&event);
 }
+
+static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
+{
+   unsigned long flags;
+   struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+   struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+   spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+   if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+   queue_work(ctx->wq, &ctx->work);
+   spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
+ struct mlx4_ib_demux_pv_qp *tun_qp,
+ int index)
+{
+   struct ib_sge sg_list;
+   struct ib_recv_wr recv_wr, *bad_recv_wr;
+   int size;
+
+   size = (tun_qp->qp->qp_type == IB_QPT_UD) ?
+   sizeof(struct mlx4_tunnel_mad) : sizeof(struct 
mlx4_mad_rcv_buf);
+
+   sg_list.addr = tun_qp->ring[index].map;
+   sg_list.length = size;
+   sg_list.lkey = ctx->mr->lkey;
+
+   recv_wr.next = NULL;
+   recv_wr.sg_list = &sg_list;
+   recv_wr.num_sge = 1;
+   recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV |
+   MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);
+   ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map,
+ size, DMA_FROM_DEVICE);
+   return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr);
+}
+
+static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+enum ib_qp_type qp_type, int is_tun)
+{
+   int i;
+   struct mlx4_ib_demux_pv_qp *tun_qp;
+   int rx_buf_size, tx_buf_size;
+
+   if (qp_type > IB_QPT_GSI)
+   return -EINVAL;
+
+   tun_qp = &ctx->qp[qp_type];
+
+   tun_qp->ring = kzalloc(sizeof(struct mlx4_ib_buf) * 
MLX4_NUM_TUNNEL_BUFS,
+  GFP_KERNEL);
+   if (!tun_qp->ring)
+   return -ENOMEM;
+
+   tun_qp->tx_ring = kzalloc(sizeof(struct mlx4_ib_tun_tx_buf) *
+ MLX4_NUM_TUNNEL_BUFS,
+ GFP_KERNEL);
+   if (!tun_qp->tx_ring) {
+   kfree(tun_qp->ring);
+

[PATCH for-next V2 01/22] IB/core: Reserve bits in enum ib_qp_create_flags for low-level driver use

2012-08-03 Thread Jack Morgenstein
Reserve bits 26-31 for internal use by low-level drivers. Two
such bits are used in the mlx4 driver SRIOV IB implementation.

These enum additions guarantee that the core layer will never use
these bits, so that low level drivers may safely make use of them.

Signed-off-by: Jack Morgenstein 
---
 include/rdma/ib_verbs.h |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 07996af..46bc045 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -614,6 +614,9 @@ enum ib_qp_type {
 enum ib_qp_create_flags {
IB_QP_CREATE_IPOIB_UD_LSO   = 1 << 0,
IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK   = 1 << 1,
+   /* reserve bits 26-31 for low level drivers' internal use */
+   IB_QP_CREATE_RESERVED_START = 1 << 26,
+   IB_QP_CREATE_RESERVED_END   = 1 << 31,
 };
 
 struct ib_qp_init_attr {
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 05/22] net/mlx4_core: Add proxy and tunnel QPs to the reserved QP area

2012-08-03 Thread Jack Morgenstein
In addition, pass the proxy and tunnel QP numbers to slaves so
the driver can perform sqp paravirtualization.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/fw.c|   14 +
 drivers/net/ethernet/mellanox/mlx4/fw.h|3 ++
 drivers/net/ethernet/mellanox/mlx4/main.c  |5 +++
 drivers/net/ethernet/mellanox/mlx4/qp.c|   29 +--
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |3 +-
 include/linux/mlx4/device.h|   13 -
 6 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c 
b/drivers/net/ethernet/mellanox/mlx4/fw.c
index c696484..0f557df 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -184,6 +184,8 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int 
slave,
 #define QUERY_FUNC_CAP_MCG_QUOTA_OFFSET0x28
 #define QUERY_FUNC_CAP_MAX_EQ_OFFSET   0x2c
 #define QUERY_FUNC_CAP_RESERVED_EQ_OFFSET  0X30
+#define QUERY_FUNC_CAP_BASE_TUNNEL_QPN_OFFSET  0X44
+#define QUERY_FUNC_CAP_BASE_PROXY_QPN_OFFSET   0X48
 
 #define QUERY_FUNC_CAP_FMR_FLAG0x80
 #define QUERY_FUNC_CAP_FLAG_RDMA   0x40
@@ -247,6 +249,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int 
slave,
size = dev->caps.num_mgms + dev->caps.num_amgms;
MLX4_PUT(outbox->buf, size, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
 
+   size = dev->caps.base_tunnel_sqpn + 8 * slave;
+   MLX4_PUT(outbox->buf, size, 
QUERY_FUNC_CAP_BASE_TUNNEL_QPN_OFFSET);
+
+   size = dev->caps.sqp_start + 8 * slave;
+   MLX4_PUT(outbox->buf, size, 
QUERY_FUNC_CAP_BASE_PROXY_QPN_OFFSET);
+
} else
err = -EINVAL;
 
@@ -312,6 +320,12 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, struct 
mlx4_func_cap *func_cap)
MLX4_GET(size, outbox, QUERY_FUNC_CAP_MCG_QUOTA_OFFSET);
func_cap->mcg_quota = size & 0xFF;
 
+   MLX4_GET(size, outbox, QUERY_FUNC_CAP_BASE_TUNNEL_QPN_OFFSET);
+   func_cap->base_tunnel_qpn = size & 0xFF;
+
+   MLX4_GET(size, outbox, QUERY_FUNC_CAP_BASE_PROXY_QPN_OFFSET);
+   func_cap->base_proxy_qpn = size & 0xFF;
+
for (i = 1; i <= func_cap->num_ports; ++i) {
err = mlx4_cmd_box(dev, 0, mailbox->dma, i, 1,
   MLX4_CMD_QUERY_FUNC_CAP,
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h 
b/drivers/net/ethernet/mellanox/mlx4/fw.h
index 83fcbbf..ced1de5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.h
@@ -134,6 +134,9 @@ struct mlx4_func_cap {
int max_eq;
int reserved_eq;
int mcg_quota;
+   u32 base_qpn;
+   u32 base_tunnel_qpn;
+   u32 base_proxy_qpn;
u8  physical_port[MLX4_MAX_PORTS + 1];
u8  port_flags[MLX4_MAX_PORTS + 1];
 };
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index e8f8ebb..892f2d6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -387,6 +387,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap)
dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_ADDR] +
dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FC_EXCH];
 
+   dev->caps.sqp_demux = (mlx4_is_master(dev)) ? MLX4_MAX_NUM_SLAVES : 0;
return 0;
 }
 /*The function checks if there are live vf, return the num of them*/
@@ -544,6 +545,10 @@ static int mlx4_slave_cap(struct mlx4_dev *dev)
return -ENODEV;
}
 
+   /* Calculate our sqp_start */
+   dev->caps.sqp_start = func_cap.base_proxy_qpn;
+   dev->caps.base_tunnel_sqpn = func_cap.base_tunnel_qpn;
+
return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c 
b/drivers/net/ethernet/mellanox/mlx4/qp.c
index fb2b367..b8da72b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx4/qp.c
@@ -406,7 +406,7 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 * We also reserve the MSB of the 24-bit QP number to indicate
 * that a QP is an XRC QP.
 */
-   dev->caps.sqp_start =
+   dev->caps.base_sqpn =
ALIGN(dev->caps.reserved_qps_cnt[MLX4_QP_REGION_FW], 8);
 
{
@@ -437,13 +437,36 @@ int mlx4_init_qp_table(struct mlx4_dev *dev)
 
}
 
+   /* Reserve 8 real SQPs in both native and SRIOV modes.
+   * In addition, in SRIOV mode, reserve 8 proxy SQPs per function
+   * (for all PFs and VFs), and 8 corresponding tunnel QPs.
+   * Each proxy SQP works opposite its own tunnel QP.
+   *
+   * The QPs are arranged as follows:
+   * 

[PATCH for-next V2 03/22] IB/core: Add ib_find_exact_cached_pkey() to search for 16-bit pkey match

2012-08-03 Thread Jack Morgenstein
When the port pkey table potentially contains both full and partial
membership copies for the same pkey, we need a function to find
the exact (16-bit) pkey index.

This is particularly necessary when the master forwards QP1 MADs
sent by guests.  If the guest has sent the MAD with a limited
membership pkey, we wish to forward the MAD using the same limited
membership pkey.  Since master may have both the limited and
the full member pkeys in its table, we must make sure to retrieve
the limited membership pkey in this case.

This requires the 16-bit pkey lookup function (which includes the
membership bit).

Signed-off-by: Jack Morgenstein 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/cache.c |   32 
 include/rdma/ib_cache.h |   16 
 2 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 0f2f2b7..d8a8c83 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -198,6 +198,38 @@ int ib_find_cached_pkey(struct ib_device *device,
 }
 EXPORT_SYMBOL(ib_find_cached_pkey);
 
+int ib_find_exact_cached_pkey(struct ib_device *device,
+ u8port_num,
+ u16   pkey,
+ u16  *index)
+{
+   struct ib_pkey_cache *cache;
+   unsigned long flags;
+   int i;
+   int ret = -ENOENT;
+
+   if (port_num < start_port(device) || port_num > end_port(device))
+   return -EINVAL;
+
+   read_lock_irqsave(&device->cache.lock, flags);
+
+   cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+   *index = -1;
+
+   for (i = 0; i < cache->table_len; ++i)
+   if (cache->table[i] == pkey) {
+   *index = i;
+   ret = 0;
+   break;
+   }
+
+   read_unlock_irqrestore(&device->cache.lock, flags);
+
+   return ret;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
 int ib_get_cached_lmc(struct ib_device *device,
  u8port_num,
  u8*lmc)
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h
index 00a2b8e..ad9a3c2 100644
--- a/include/rdma/ib_cache.h
+++ b/include/rdma/ib_cache.h
@@ -101,6 +101,22 @@ int ib_find_cached_pkey(struct ib_device*device,
u16 *index);
 
 /**
+ * ib_find_exact_cached_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit)
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the cached PKey table where the PKey was found.
+ *
+ * ib_find_exact_cached_pkey() searches the specified PKey table in
+ * the local software cache.
+ */
+int ib_find_exact_cached_pkey(struct ib_device*device,
+ u8   port_num,
+ u16  pkey,
+ u16 *index);
+
+/**
  * ib_get_cached_lmc - Returns a cached lmc table entry
  * @device: The device to query.
  * @port_num: The port number of the device to query.
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 02/22] IB/core: change pkey table lookups to support full and partial membership for the same pkey

2012-08-03 Thread Jack Morgenstein
Enhance the cached and non-cached pkey table lookups to enable limited and full
members of the same pkey to co-exist in the pkey table.

This is necessary for SRIOV to allow for a scheme where some guests would have 
the full
membership pkey in their virtual pkey table, where other guests on the same 
hypervisor
would have the limited one. In that sense, it's an extension of the IBTA model 
for
non virtualized nodes.

To accomplish this, we need both the limited and full membership pkeys to be 
present
in the master's (hypervisor physical port) pkey table.

The algorithm for supporting pkey tables which contain both the limited and the 
full
membership versions of the same pkey works as follows:

When scanning the pkey table for a 15 bit pkey:

A. If there is a full member version of that pkey anywhere
in the table, return its index (even if a limited-member
version of the pkey exists earlier in the table).

B. If the full member version is not in the table,
but the limited-member version is in the table,
return the index of the limited pkey.

Signed-off-by: Liran Liss 
Signed-off-by: Jack Morgenstein 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/cache.c  |   14 +++---
 drivers/infiniband/core/device.c |   17 +
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 9353992..0f2f2b7 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -167,6 +167,7 @@ int ib_find_cached_pkey(struct ib_device *device,
unsigned long flags;
int i;
int ret = -ENOENT;
+   int partial_ix = -1;
 
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
@@ -179,10 +180,17 @@ int ib_find_cached_pkey(struct ib_device *device,
 
for (i = 0; i < cache->table_len; ++i)
if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
-   *index = i;
-   ret = 0;
-   break;
+   if (cache->table[i] & 0x8000) {
+   *index = i;
+   ret = 0;
+   break;
+   } else
+   partial_ix = i;
}
+   if (ret && partial_ix >= 0) {
+   *index = partial_ix;
+   ret = 0;
+   }
 
read_unlock_irqrestore(&device->cache.lock, flags);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index e711de4..a645c68 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -707,18 +707,27 @@ int ib_find_pkey(struct ib_device *device,
 {
int ret, i;
u16 tmp_pkey;
+   int partial_ix = -1;
 
for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; 
++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
-
if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
-   *index = i;
-   return 0;
+   /* if there is full-member pkey take it.*/
+   if (tmp_pkey & 0x8000) {
+   *index = i;
+   return 0;
+   }
+   if (partial_ix < 0)
+   partial_ix = i;
}
}
-
+   /*no full-member, if exists take the limited*/
+   if (partial_ix >= 0) {
+   *index = partial_ix;
+   return 0;
+   }
return -ENOENT;
 }
 EXPORT_SYMBOL(ib_find_pkey);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V2 00/22] Add SRIOV support for IB interfaces

2012-08-03 Thread Jack Morgenstein
s in 
mlx4_phys_caps
struct and paravirtualize them

2. patches (new numbering) "5: net/mlx4_core: Add proxy and tunnel QPs to the 
reserved QP area"
   and "8: IB/mlx4: SRIOV multiplex and demultiplex MADs"
   were adjusted to account for changes introduced by commits to mlx4_core from 
other gits in the
   kernel V3.5 merge 

3. A fix to event processing in the V2 patch set was added to patch
   "18: IB/mlx4: Miscellaneous adjustments to SRIOV IB support" :

   Add spinlock to slave_event(), since it is called both in interrupt
   context and in process context (due to patch 6, and also if smp_snoop
   is used).
   This fix was found and implemented by Saeed Mahameed 
 
4. Two new patches were added to V2:
   21: {NET,IB}/mlx4: Modify proxy/tunnel QP mechanism so that guests do no 
calculations
   22: IB/mlx4: Create pv contexts for active VFs when PF (master) ib driver 
initializes

5. The workaround for the KVM IRQ management problem is no longer needed.  The 
KVM group
   fixed the problem in kernel 3.5-rc7.

Changes for V1
--

1. librdmacm now supports multiple VF/PF on the same host (patch 29).
2. Several patches cleaned up (these were indicated in the V0 changelogs).
   Major cleanups in patch 22 and patch 24.
3. Eliminated code duplication in Port Management Change event code (patch 8).
4. Now use pr_debug, instead of mlx4_ib_debug, and there is no module parameter
   (Roland's recommendation).
5. mlx4_master_func_num() to get the master's "slave_id", to make code more 
readable.
6. Fixed illegal use of port num field for a force-loopback bit in ib_ah 
structure
   (V0 patch 2 -- eliminated). The force-loopback bit is now set for Tunnel QPs 
in 
   mlx4_ib_post send (patch 17).
7. New patch 2 (not related to 6 above) to reserve bits in enum 
ib_qp_create_flags.
8. V0 patch 26 is now rolled into V1 patch 22. This allowed us to eliminate 
function
   mlx4_ib_indexed_gid from patch 22 (replaced by using __mlx4_ib_query_gid() 
from V0 patch 26).


Amir Vadai (1):
  IB/mlx4: Add CM paravirtualization

Erez Shitrit (1):
  IB/sa: Add GuidInfoRecord query support.

Jack Morgenstein (26):
  net/mlx4_core: Pass an invalid PCI id number to VFs
  IB/core: Reserve bits in enum ib_qp_create_flags for low-level driver
use
  IB/mlx4: Add debug printouts
  IB/core: change pkey table lookups to support full and partial
membership for the same pkey
  IB/core: Add ib_find_exact_cached_pkey() to search for 16-bit pkey
match
  IB/core: move macros from cm_msgs.h to ib_cm.h
  {NET,IB}/mlx4: Use port management change event instead of smp_snoop
  net/mlx4_core: For SRIOV, initialize ib port-capabilities for all
slaves
  net/mlx4_core: Implement mechanism for reserved qkeys
  net/mlx4_core: Allow guests to support IB ports
  {NET,IB}/mlx4_core: place phys gid and pkey tbl sizes in
mlx4_phys_caps struct and paravirtualize them
  IB/mlx4: SRIOV IB context objects and proxy/tunnel sqp support
  net/mlx4_core: Add proxy and tunnel QPs to the reserved QP area
  IB/mlx4: Initialize SRIOV IB support for slaves in master context
  {NET,IB}/mlx4: Implement QP paravirtualization and maintain
phys_pkey_cache for smp_snoop
  IB/mlx4: SRIOV multiplex and demultiplex MADs
  {NET,IB}/mlx4: MAD_IFC paravirtualization
  net/mlx4_core: Add IB port-state machine, and port mgmt event
propagation infrastructure
  {NET,IB}/mlx4: Add alias_guid mechanism
  IB/mlx4: Propagate pkey and guid change port management events to
slaves
  IB/mlx4: Add iov directory in sysfs under the ib device
  net/mlx4_core: Adjustments to SET_PORT for SRIOV-IB
  net/mlx4_core: INIT/CLOSE port logic for IB ports in SRIOV mode
  IB/mlx4: Miscellaneous adjustments to SRIOV IB support
  {NET,IB}/mlx4: Activate SRIOV mode for IB
  {NET,IB}/mlx4: Paravirtualize Node Guids for slaves.

Oren Duer (1):
  IB/mlx4: Added Multicast Groups (MCG) para-virtualization for SRIOV

 drivers/infiniband/core/cache.c|   42 +-
 drivers/infiniband/core/cm_msgs.h  |   12 -
 drivers/infiniband/core/device.c   |   17 +-
 drivers/infiniband/core/sa_query.c |  133 ++
 drivers/infiniband/hw/mlx4/Makefile|2 +-
 drivers/infiniband/hw/mlx4/alias_GUID.c|  688 
 drivers/infiniband/hw/mlx4/cm.c|  437 +
 drivers/infiniband/hw/mlx4/cq.c|   31 +-
 drivers/infiniband/hw/mlx4/mad.c   | 1684 +++-
 drivers/infiniband/hw/mlx4/main.c  |  285 +++-
 drivers/infiniband/hw/mlx4/mcg.c   | 1254 +++
 drivers/infiniband/hw/mlx4/mlx4_ib.h   |  360 +-
 drivers/infiniband/hw/mlx4/qp.c|  651 +++-
 drivers/infiniband/hw/mlx4/sysfs.c |  794 +
 drivers/net/ethernet/mellanox/mlx4/cmd.c   |  190 +++-
 drivers/net/ethernet/mellanox/m

[PATCH] IB/mlx4: fix possible deadlock with sm_lock spinlock

2012-08-03 Thread Jack Morgenstein
The sm_lock spinlock is taken in the process context by mlx4_ib_modify_device,
and in the interrupt context by update_sm_ah.

Need to take that spinlock with irqsave, and release it with irqrestore.

>From a stack trace with LOCKDEP configured in the kernel:
[ INFO: inconsistent lock state ]
3.5.0+ #20 Not tainted
inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
swapper/0/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
(&(&ibdev->sm_lock)->rlock){?.+...}, at: [] 
update_sm_ah+0xad/0x100 [mlx4_ib]
{HARDIRQ-ON-W} state was registered at:
  [] mark_irqflags+0x120/0x190
  [] __lock_acquire+0x307/0x4c0
  [] lock_acquire+0xb1/0x150
  [] _raw_spin_lock+0x41/0x50
  [] mlx4_ib_modify_device+0x63/0x240 [mlx4_ib]
  [] ib_modify_device+0x1c/0x20 [ib_core]
  [] set_node_desc+0x83/0xc0 [ib_core]
  [] dev_attr_store+0x20/0x30
  [] sysfs_write_file+0xe6/0x170
  [] vfs_write+0xc8/0x190
  [] sys_write+0x51/0x90
  [] system_call_fastpath+0x16/0x1b

...
*** DEADLOCK ***

1 lock held by swapper/0/0:

stack backtrace:
Pid: 0, comm: swapper/0 Not tainted 3.5.0+ #20
Call Trace:
  [] print_usage_bug+0x18a/0x190
[] ? print_irq_inversion_bug+0x210/0x210
[] mark_lock_irq+0xf2/0x280
[] mark_lock+0x150/0x240
[] mark_irqflags+0x16f/0x190
[] __lock_acquire+0x307/0x4c0
[] ? update_sm_ah+0xad/0x100 [mlx4_ib]
[] lock_acquire+0xb1/0x150
[] ? update_sm_ah+0xad/0x100 [mlx4_ib]
[] _raw_spin_lock+0x41/0x50
[] ? update_sm_ah+0xad/0x100 [mlx4_ib]
[] ? ib_create_ah+0x1a/0x40 [ib_core]
[] update_sm_ah+0xad/0x100 [mlx4_ib]
[] ? is_module_address+0x23/0x30
[] handle_port_mgmt_change_event+0xeb/0x150 [mlx4_ib]
[] mlx4_ib_event+0x117/0x160 [mlx4_ib]
[] ? _raw_spin_lock_irqsave+0x61/0x70
[] mlx4_dispatch_event+0x6c/0x90 [mlx4_core]
[] mlx4_eq_int+0x500/0x950 [mlx4_core]

Reported by: Or Gerlitz 
Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |   16 ++--
 drivers/infiniband/hw/mlx4/main.c |7 ---
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index c27141f..9c2ae7e 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -125,6 +125,7 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 
port_num, u16 lid, u8 sl)
 {
struct ib_ah *new_ah;
struct ib_ah_attr ah_attr;
+   unsigned long flags;
 
if (!dev->send_agent[port_num - 1][0])
return;
@@ -139,11 +140,11 @@ static void update_sm_ah(struct mlx4_ib_dev *dev, u8 
port_num, u16 lid, u8 sl)
if (IS_ERR(new_ah))
return;
 
-   spin_lock(&dev->sm_lock);
+   spin_lock_irqsave(&dev->sm_lock, flags);
if (dev->sm_ah[port_num - 1])
ib_destroy_ah(dev->sm_ah[port_num - 1]);
dev->sm_ah[port_num - 1] = new_ah;
-   spin_unlock(&dev->sm_lock);
+   spin_unlock_irqrestore(&dev->sm_lock, flags);
 }
 
 /*
@@ -197,13 +198,15 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
 static void node_desc_override(struct ib_device *dev,
   struct ib_mad *mad)
 {
+   unsigned long flags;
+
if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
 mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
-   spin_lock(&to_mdev(dev)->sm_lock);
+   spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags);
memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
-   spin_unlock(&to_mdev(dev)->sm_lock);
+   spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags);
}
 }
 
@@ -213,6 +216,7 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 
port_num, struct ib_mad *ma
struct ib_mad_send_buf *send_buf;
struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
int ret;
+   unsigned long flags;
 
if (agent) {
send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
@@ -225,13 +229,13 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 
port_num, struct ib_mad *ma
 * wrong following the IB spec strictly, but we know
 * it's OK for our devices).
 */
-   spin_lock(&dev->sm_lock);
+   spin_lock_irqsave(&dev->sm_lock, flags);
memcpy(send_buf->mad, mad, sizeof *mad);
if ((send_buf->ah = dev->sm_ah[port_num - 1]))
ret = ib_post_send_mad(send_buf, NULL);
else
ret = -EINVAL;
-   spin_unlock(&dev->sm_lock);
+   spin_unlock_irqrestore(&dev->sm_lock, flags);
 
if (ret

Rebasing your for-next branch to 3.5-rc7

2012-07-16 Thread Jack Morgenstein
Hello Roland,

I am back from vacation.  Hopefully, you are progressing with your review of
the huge patch set I submitted -- please let me know if there is anything
I can do to help.

I noticed that the KVM group submitted a fix for the SRIOV IRQ guest problem
(Point #3 of the "Some Limitations" section of the patch set cover letter):

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=a76beb14123a69ca080f5a5425e28b786d62318d

This fix is incorporated in 3.5-rc7.  If you rebase, we can get rid of the 
workaround.

Thanks!

-Jack

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: linux-next: manual merge of the net-next tree with the infiniband tree

2012-07-16 Thread Jack Morgenstein
On Thursday 12 July 2012 05:13, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the net-next tree got a conflict in
> include/linux/mlx4/device.h between commit 396f2feb05d7 ("mlx4_core:
> Implement mechanism for reserved Q_Keys") from the infiniband tree and
> commit 0ff1fb654bec ("{NET, IB}/mlx4: Add device managed flow steering
> firmware API") from the net-next tree.
> 
> Just context changes.  I fixed it up (see below) and can carry the fix
> as necessary.

Thanks, Stephen!

Your merge looks fine. Ack.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: linux-next: manual merge of the net-next tree with the infiniband tree

2012-07-16 Thread Jack Morgenstein
On Thursday 12 July 2012 05:09, Stephen Rothwell wrote:
> Hi all,
> 
> Today's linux-next merge of the net-next tree got a conflict in
> drivers/net/ethernet/mellanox/mlx4/main.c between commit 6634961c14d3
> ("mlx4: Put physical GID and P_Key table sizes in mlx4_phys_caps struct
> and paravirtualize them") from the infiniband tree and commit
> 0ff1fb654bec ("{NET, IB}/mlx4: Add device managed flow steering firmware
> API") from the net-next tree.
> 
> Just context changes (I think).  I have fixed it up (see below) and can
> carry the fix as necessary.

Thanks, Stephen!

Ack for IB side.

-Jack
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V1 04/29] IB/core: change pkey table lookups to support full and partial membership for the same pkey

2012-06-19 Thread Jack Morgenstein
Enhance the cached and non-cached pkey table lookups to enable limited and full
members of the same pkey to co-exist in the pkey table.

This is necessary for SRIOV to allow for a scheme where some guests would have 
the full
membership pkey in their virtual pkey table, where other guests on the same 
hypervisor
would have the limited one. In that sense, it's an extension of the IBTA model 
for
non virtualized nodes.

To accomplish this, we need both the limited and full membership pkeys to be 
present
in the master's (hypervisor physical port) pkey table.

The algorithm for supporting pkey tables which contain both the limited and the 
full
membership versions of the same pkey works as follows:

When scanning the pkey table for a 15 bit pkey:

A. If there is a full member version of that pkey anywhere
in the table, return its index (even if a limited-member
version of the pkey exists earlier in the table).

B. If the full member version is not in the table,
but the limited-member version is in the table,
return the index of the limited pkey.

Signed-off-by: Liran Liss 
Signed-off-by: Jack Morgenstein 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/cache.c  |   14 +++---
 drivers/infiniband/core/device.c |   17 +
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 9353992..0f2f2b7 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -167,6 +167,7 @@ int ib_find_cached_pkey(struct ib_device *device,
unsigned long flags;
int i;
int ret = -ENOENT;
+   int partial_ix = -1;
 
if (port_num < start_port(device) || port_num > end_port(device))
return -EINVAL;
@@ -179,10 +180,17 @@ int ib_find_cached_pkey(struct ib_device *device,
 
for (i = 0; i < cache->table_len; ++i)
if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
-   *index = i;
-   ret = 0;
-   break;
+   if (cache->table[i] & 0x8000) {
+   *index = i;
+   ret = 0;
+   break;
+   } else
+   partial_ix = i;
}
+   if (ret && partial_ix >= 0) {
+   *index = partial_ix;
+   ret = 0;
+   }
 
read_unlock_irqrestore(&device->cache.lock, flags);
 
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index e711de4..a645c68 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -707,18 +707,27 @@ int ib_find_pkey(struct ib_device *device,
 {
int ret, i;
u16 tmp_pkey;
+   int partial_ix = -1;
 
for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; 
++i) {
ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
if (ret)
return ret;
-
if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
-   *index = i;
-   return 0;
+   /* if there is full-member pkey take it.*/
+   if (tmp_pkey & 0x8000) {
+   *index = i;
+   return 0;
+   }
+   if (partial_ix < 0)
+   partial_ix = i;
}
}
-
+   /*no full-member, if exists take the limited*/
+   if (partial_ix >= 0) {
+   *index = partial_ix;
+   return 0;
+   }
return -ENOENT;
 }
 EXPORT_SYMBOL(ib_find_pkey);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V1 13/29] IB/mlx4: SRIOV IB context objects and proxy/tunnel sqp support

2012-06-19 Thread Jack Morgenstein
1. Introduce the basic sriov paravirtualization context objects
   for multiplexing and demultiplexing MADs.
2. Introduce support for the new proxy and tunnel QP types.

This patch introduces the objects required by the master
for managing QP paravirtualization for guests.

struct mlx4_ib_sriov{} is created by the master only.
It is a container for the following:
1. All the info required by the PPF to multiplex and de-multiplex MADs
   (including those from the PF). (struct mlx4_ib_demux_ctx demux)
2. All the info required to manage alias GUIDs (i.e., the GUID at
   index 0 that each guest perceives.  In fact, this is not the
   GUID which is actually at index 0, but is, in fact, the GUID
   which is at index[] in the physical table.

3. structures which are used to manage CM paravirtualization
4. structures for managing the real special QPs when running in
   sriov mode.
   The real SQPs are controlled by the PPF in this case.  All SQPs
   created and controlled by the ib core layer are proxy sqps

struct mlx4_ib_demux_ctx{} contains the information per port needed
to manage paravirtualization.
This includes:
1. All multicast paravirt info
2. All tunnel-qp paravirt info for the port.
3. GUID-table and GUID-prefix for the port
4. work queues.

struct mlx4_ib_demux_pv_ctx{} contains all the info for managing
the paravirtualized QPs for one slave/port.

struct mlx4_ib_demux_pv_qp{} contains the info need to run an
individual QP (either tunnel qp or real SQP).

Note:  We made use of the 2 most-significant bits in enum
mlx4_ib_qp_flags (based upon enum ib_qp_create_flags (ib_verbs.h))

We need these bits in the low-level driver for internal purposes.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/cq.c  |   31 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  128 +++-
 drivers/infiniband/hw/mlx4/qp.c  |  616 ++
 include/linux/mlx4/device.h  |1 +
 include/linux/mlx4/qp.h  |3 +-
 5 files changed, 702 insertions(+), 77 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 6d4ef71..342fabd 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -547,6 +547,26 @@ static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 
checksum)
checksum == cpu_to_be16(0x);
 }
 
+static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, 
struct ib_wc *wc,
+  unsigned tail, struct mlx4_cqe *cqe)
+{
+   struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+   ib_dma_sync_single_for_cpu(qp->ibqp.device,
+  qp->sqp_proxy_rcv[tail].map,
+  sizeof(struct mlx4_ib_proxy_sqp_hdr),
+  DMA_FROM_DEVICE);
+   hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+   wc->pkey_index  = be16_to_cpu(hdr->tun.pkey_index);
+   wc->slid= be16_to_cpu(hdr->tun.slid_mac_47_32);
+   wc->sl  = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+   wc->src_qp  = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFF;
+   wc->wc_flags   |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+   wc->dlid_path_bits = 0;
+
+   return 0;
+}
+
 static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
struct mlx4_ib_qp **cur_qp,
struct ib_wc *wc)
@@ -559,6 +579,7 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
int is_error;
u32 g_mlpath_rqpn;
u16 wqe_ctr;
+   unsigned tail = 0;
 
 repoll:
cqe = next_cqe_sw(cq);
@@ -634,7 +655,8 @@ repoll:
mlx4_ib_free_srq_wqe(srq, wqe_ctr);
} else {
wq= &(*cur_qp)->rq;
-   wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+   tail  = wq->tail & (wq->wqe_cnt - 1);
+   wc->wr_id = wq->wrid[tail];
++wq->tail;
}
 
@@ -717,6 +739,13 @@ repoll:
break;
}
 
+   if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+   if ((*cur_qp)->mlx4_ib_qp_type &
+   (MLX4_IB_QPT_PROXY_SMI_OWNER |
+MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+   return use_tunnel_data(*cur_qp, cq, wc, tail, 
cqe);
+   }
+
wc->slid   = be16_to_cpu(cqe->rlid);
g_mlpath_rqpn  = be32_to_cpu(cqe->g_mlpath_rqpn);
wc->src_qp = g_mlpath_rqpn & 0xff;
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 23bfbf9..55d3bfb 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@

[PATCH for-next V1 23/29] IB/mlx4: Propagate pkey and guid change port management events to slaves

2012-06-19 Thread Jack Morgenstein
pkey change and guid change events are not of interest to all slaves,
but only to those slaves which "see" the table slots whose contents
have changed.

For example, if the guid at port 1, index 5 has changed in the
PPF, we wish to propagate the gid-change event only to the function
which has that guid index mapped to its port/guid table (in this case
it is slave #5). Other functions should not get the event,
since the event does not affect them.

Similarly with pkeys -- pkey change events are forwarded
only to slaves which have that pkey index mapped to their
virtual pkey table.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c|  162 +--
 drivers/net/ethernet/mellanox/mlx4/fw.c |6 +
 2 files changed, 161 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 1d618af..16c6d55 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -54,6 +54,15 @@ enum {
 #define MLX4_TUN_IS_RECV(a)  (((a) >>  MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
 #define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
 
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) 
be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) 
be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
 struct mlx4_mad_rcv_buf {
struct ib_grh grh;
u8 payload[256];
@@ -76,6 +85,9 @@ struct mlx4_rcv_tunnel_mad {
 } __packed;
 
 static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+   int block, u32 change_bitmap);
 
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
 {
@@ -220,8 +232,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, 
struct ib_mad *mad,
handle_client_rereg_event(dev, port_num);
 
if (prev_lid != lid)
-   mlx4_ib_dispatch_event(dev, port_num,
-  IB_EVENT_LID_CHANGE);
+   handle_lid_change_event(dev, port_num);
break;
 
case IB_SMP_ATTR_PKEY_TABLE:
@@ -231,6 +242,9 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, 
struct ib_mad *mad,
break;
}
 
+   /* at this point, we are running in the master.
+* Slaves do not receive SMPs.
+*/
bn  = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 
0x;
base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
pkey_change_bitmap = 0;
@@ -248,10 +262,13 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
 "block=0x%x, change_bitmap=0x%x\n",
 port_num, bn, pkey_change_bitmap);
 
-   if (pkey_change_bitmap)
+   if (pkey_change_bitmap) {
mlx4_ib_dispatch_event(dev, port_num,
   IB_EVENT_PKEY_CHANGE);
-
+   if (!dev->sriov.is_going_down)
+   __propagate_pkey_ev(dev, port_num, bn,
+   pkey_change_bitmap);
+   }
break;
 
case IB_SMP_ATTR_GUID_INFO:
@@ -259,12 +276,56 @@ static void smp_snoop(struct ib_device *ibdev, u8 
port_num, struct ib_mad *mad,
if (!mlx4_is_master(dev->dev))
mlx4_ib_dispatch_event(dev, port_num,
   IB_EVENT_GID_CHANGE);
+   /*if master, notify relevant slaves*/
+   if (mlx4_is_master(dev->dev) &&
+   !dev->sriov.is_going_down) {
+   bn = be32_to_cpu(((struct ib_smp 
*)mad)->attr_mod);
+   mlx4_ib_update_cache_on_guid_change(dev, bn, 
port_num,
+   (u8 
*)(&((struct ib_smp *)mad)->data));
+   mlx4_ib_notify_slaves_on_guid_change(dev, bn, 
port_num,
+   

[PATCH for-next V1 19/29] IB/mlx4: Added Multicast Groups (MCG) para-virtualization for SRIOV

2012-06-19 Thread Jack Morgenstein
From: Oren Duer 

MCG para-virtualization support includes:
- Creating multicast groups by VFs, and keeping accounting of them
- Leaving multicast groups by VFs
- SM will only be updated with real changes in the overall picture of MCGs 
status
- Creation of MGID=0 groups (let SM choose MGID)

Note that the MCG module maintains its own internal MCG object reference
counts. The reason for this is that the IB core is used to track only
the multicast group joins generated by the PF it runs over.
The PF ib-core layer is unaware of slaves, so it cannot be used to keep
track of MCG joins they generate.

Signed-off-by: Oren Duer 
Signed-off-by: Eli Cohen 
Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile  |2 +-
 drivers/infiniband/hw/mlx4/mad.c |   60 ++-
 drivers/infiniband/hw/mlx4/main.c|   18 +-
 drivers/infiniband/hw/mlx4/mcg.c | 1187 ++
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   30 +
 5 files changed, 1285 insertions(+), 12 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/mcg.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index 70f09c7..20d627d 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 538003e..513bb0d 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -75,6 +75,14 @@ struct mlx4_rcv_tunnel_mad {
struct ib_mad mad;
 } __packed;
 
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+   return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+   cpu_to_be64(0xff00LL);
+}
+
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
 void *in_mad, void *response_mad)
@@ -209,8 +217,7 @@ static void smp_snoop(struct ib_device *ibdev, u8 port_num, 
struct ib_mad *mad,
 pinfo->neighbormtu_mastersmsl & 0xf);
 
if (pinfo->clientrereg_resv_subnetto & 0x80)
-   mlx4_ib_dispatch_event(dev, port_num,
-  
IB_EVENT_CLIENT_REREGISTER);
+   handle_client_rereg_event(dev, port_num);
 
if (prev_lid != lid)
mlx4_ib_dispatch_event(dev, port_num,
@@ -305,7 +312,17 @@ static void forward_trap(struct mlx4_ib_dev *dev, u8 
port_num, struct ib_mad *ma
 static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int 
slave,
 struct ib_sa_mad 
*sa_mad)
 {
-   return 0;
+   int ret = 0;
+
+   /* dispatch to different sa handlers */
+   switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+   case IB_SA_ATTR_MC_MEMBER_REC:
+   ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+   break;
+   default:
+   break;
+   }
+   return ret;
 }
 
 int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
@@ -765,6 +782,16 @@ void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
}
 }
 
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+   /* re-configure the mcg's */
+   if (mlx4_is_master(dev->dev)) {
+   if (!dev->sriov.is_going_down)
+   mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 
1], 0);
+   }
+   mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
 void handle_port_mgmt_change_event(struct work_struct *work)
 {
struct ib_event_work *ew = container_of(work, struct ib_event_work, 
work);
@@ -794,8 +821,7 @@ void handle_port_mgmt_change_event(struct work_struct *work)
mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
 
if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
-   mlx4_ib_dispatch_event(dev, port,
-  IB_EVENT_CLIENT_REREGISTER);
+   handle_client_rereg_event(dev, port);
break;
 
case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
@@ -865,7 +891,17 @@ static int mlx4_ib_post_pv_qp_buf(struct 
mlx4_ib_demux_pv_ctx *ctx,
 static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
int slave, struct ib_sa_mad *sa_mad)
 {
-   return 0;
+   int ret = 0;
+
+   /* dispatch to different sa handlers */
+

[PATCH for-next V1 24/29] IB/mlx4: Add iov directory in sysfs under the ib device

2012-06-19 Thread Jack Morgenstein
This directory is added only for the master -- slaves do
not have it.

The sysfs iov directory is used to manage and examine the port
pkey and guid paravirtualization.

Under iov/ports, the administrator may examine the gid and pkey
tables as they are present in the device (and as they are seen in the
"network view" presented to the SM).

Under the iov/<pci slave id> directories, the admin may map
the index numbers in the physical tables (as under iov/ports)
to the paravirtualized index numbers that guests see.

Thus, for example, if the administrator, for port 1 on guest 2, say,
maps physical pkey index 10 to virtual index 1, that guest, whenever
it uses its pkey index 1 will actually be using the real pkey index 10.

Based on a patch from Erez Shitrit 

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile |2 +-
 drivers/infiniband/hw/mlx4/alias_GUID.c |6 +-
 drivers/infiniband/hw/mlx4/mad.c|9 +
 drivers/infiniband/hw/mlx4/mcg.c|   67 +++
 drivers/infiniband/hw/mlx4/mlx4_ib.h|   43 ++
 drivers/infiniband/hw/mlx4/sysfs.c  |  794 +++
 6 files changed, 917 insertions(+), 4 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/sysfs.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index 31d4c8a..f4213b3 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o 
alias_GUID.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o 
alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c 
b/drivers/infiniband/hw/mlx4/alias_GUID.c
index 9db1581..c4252e7 100644
--- a/drivers/infiniband/hw/mlx4/alias_GUID.c
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -113,7 +113,7 @@ static __be64 get_cached_alias_guid(struct mlx4_ib_dev 
*dev, int port, int index
 }
 
 
-static ib_sa_comp_mask get_aguid_comp_mask_from_ix(int index)
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
 {
return IB_SA_COMP_MASK(4 + index);
 }
@@ -259,7 +259,7 @@ static void aliasguid_query_handler(int status,
/* Mark the record as not assigned, and let it
 * be sent again in the next work sched.*/
rec->status = MLX4_GUID_INFO_STATUS_IDLE;
-   rec->guid_indexes |= 
get_aguid_comp_mask_from_ix(i);
+   rec->guid_indexes |= 
mlx4_ib_get_aguid_comp_mask_from_ix(i);
}
} else {
   /* properly assigned record. */
@@ -337,7 +337,7 @@ static void invalidate_guid_record(struct mlx4_ib_dev *dev, 
u8 port, int index)
MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid.
ports_guid[port - 1].all_rec_per_port[index].ownership)
continue;
-   comp_mask |= get_aguid_comp_mask_from_ix(i);
+   comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
}
dev->sriov.alias_guid.ports_guid[port - 1].
all_rec_per_port[index].guid_indexes = comp_mask;
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 16c6d55..8f10da7 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1960,6 +1960,11 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
goto paravirt_err;
}
+   err = mlx4_ib_device_register_sysfs(dev);
+   if (err) {
+   mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+   goto sysfs_err;
+   }
 
mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 
clients\n",
 dev->dev->caps.sqp_demux);
@@ -1986,6 +1991,9 @@ demux_err:
mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
--i;
}
+   mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
mlx4_ib_destroy_alias_guid_service(dev);
 
 paravirt_err:
@@ -2016,5 +2024,6 @@ void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
 
mlx4_ib_cm_paravirt_clean(dev, -1);
mlx4_ib_destroy_alias_guid_service(dev);
+   mlx4_ib_device_unregister_sysfs(dev);
}
 }
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index 1ee2e3a..3c3b54c 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -110,6 +110,7 @@ struct mcast_group {
__be64  last_req_tid;
 
charname[33]; /* MGID string */
+   struct device_attribute dentry;
 
  

[PATCH for-next V1 27/29] IB/mlx4: Miscellaneous adjustments to SRIOV IB support

2012-06-19 Thread Jack Morgenstein
1. allow only master to change node description
2. prevent ah leakage in send mads
3. take device part number from PCI structure, so that
   guests see the VF part number (and not the PF part number)
4. place the device revision ID into caps structure at startup
5. SET_PORT in update_gids_task needs to go through wrapper on master.
6. in mlx4_ib_event, PORT_MGMT_EVENT needs be handled in a work queue
   on the master, since it propagates events to slaves using GEN_EQE
7. Do not support FMR on slaves.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |6 +-
 drivers/infiniband/hw/mlx4/main.c |   26 ++
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 8f10da7..6a485d0 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -706,7 +706,9 @@ static int ib_process_mad(struct ib_device *ibdev, int 
mad_flags, u8 port_num,
if (!out_mad->mad_hdr.status) {
if (!(to_mdev(ibdev)->dev->caps.flags & 
MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
smp_snoop(ibdev, port_num, in_mad, prev_lid);
-   node_desc_override(ibdev, out_mad);
+   /* slaves get node desc from FW */
+   if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+   node_desc_override(ibdev, out_mad);
}
 
/* set return bit in status of directed route responses */
@@ -789,6 +791,8 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int 
mad_flags, u8 port_num,
 static void send_handler(struct ib_mad_agent *agent,
 struct ib_mad_send_wc *mad_send_wc)
 {
+   if (mad_send_wc->send_buf->context[0])
+   ib_destroy_ah(mad_send_wc->send_buf->context[0]);
ib_free_send_mad(mad_send_wc->send_buf);
 }
 
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 664e053..f072572 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -138,7 +138,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
 
props->vendor_id   = be32_to_cpup((__be32 *) (out_mad->data + 
36)) &
0xff;
-   props->vendor_part_id  = be16_to_cpup((__be16 *) (out_mad->data + 
30));
+   props->vendor_part_id  = dev->dev->pdev->device;
props->hw_ver  = be32_to_cpup((__be32 *) (out_mad->data + 
32));
memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
 
@@ -477,6 +477,9 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, 
int mask,
if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
return 0;
 
+   if (mlx4_is_slave(to_mdev(ibdev)->dev))
+   return -EOPNOTSUPP;
+
spin_lock(&to_mdev(ibdev)->sm_lock);
memcpy(ibdev->node_desc, props->node_desc, 64);
spin_unlock(&to_mdev(ibdev)->sm_lock);
@@ -492,7 +495,7 @@ static int mlx4_ib_modify_device(struct ib_device *ibdev, 
int mask,
memset(mailbox->buf, 0, 256);
memcpy(mailbox->buf, props->node_desc, 64);
mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
-MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED);
+MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
 
mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
 
@@ -872,6 +875,7 @@ static int init_node_data(struct mlx4_ib_dev *dev)
if (err)
goto out;
 
+   dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
 
 out:
@@ -960,7 +964,7 @@ static void update_gids_task(struct work_struct *work)
 
err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 
gw->port,
   1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
-  MLX4_CMD_NATIVE);
+  MLX4_CMD_WRAPPED);
if (err)
pr_warn("set port command failed\n");
else {
@@ -1350,10 +1354,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.detach_mcast  = mlx4_ib_mcg_detach;
ibdev->ib_dev.process_mad   = mlx4_ib_process_mad;
 
-   ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
-   ibdev->ib_dev.map_phys_fmr  = mlx4_ib_map_phys_fmr;
-   ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
-   ibdev->ib_dev.dealloc_fmr   = mlx4_ib_fmr_dealloc;
+   if (!mlx4_is_slave(ibdev->dev)) {
+   ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
+   ibdev->ib_dev.map_phys_fmr  = mlx4_ib_map_phys_fmr;
+   ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
+   ibde

[PATCH for-next V1 07/29] IB/core: move macros from cm_msgs.h to ib_cm.h

2012-06-19 Thread Jack Morgenstein
These macros will be reused by the mlx4 SRIOV-IB CM
paravirtualization code, and there is no reason to have
them declared both in the IB core in the mlx4 IB driver.

Signed-off-by: Jack Morgenstein 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/cm_msgs.h |   12 
 include/rdma/ib_cm.h  |   12 
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/core/cm_msgs.h 
b/drivers/infiniband/core/cm_msgs.h
index 7da9b21..be068f4 100644
--- a/drivers/infiniband/core/cm_msgs.h
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -44,18 +44,6 @@
 
 #define IB_CM_CLASS_VERSION2 /* IB specification 1.2 */
 
-#define CM_REQ_ATTR_ID cpu_to_be16(0x0010)
-#define CM_MRA_ATTR_ID cpu_to_be16(0x0011)
-#define CM_REJ_ATTR_ID cpu_to_be16(0x0012)
-#define CM_REP_ATTR_ID cpu_to_be16(0x0013)
-#define CM_RTU_ATTR_ID cpu_to_be16(0x0014)
-#define CM_DREQ_ATTR_IDcpu_to_be16(0x0015)
-#define CM_DREP_ATTR_IDcpu_to_be16(0x0016)
-#define CM_SIDR_REQ_ATTR_IDcpu_to_be16(0x0017)
-#define CM_SIDR_REP_ATTR_IDcpu_to_be16(0x0018)
-#define CM_LAP_ATTR_ID cpu_to_be16(0x0019)
-#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
-
 enum cm_msg_sequence {
CM_MSG_SEQUENCE_REQ,
CM_MSG_SEQUENCE_LAP,
diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h
index 83f77ac..0e3ff30 100644
--- a/include/rdma/ib_cm.h
+++ b/include/rdma/ib_cm.h
@@ -262,6 +262,18 @@ struct ib_cm_event {
void*private_data;
 };
 
+#define CM_REQ_ATTR_ID cpu_to_be16(0x0010)
+#define CM_MRA_ATTR_ID cpu_to_be16(0x0011)
+#define CM_REJ_ATTR_ID cpu_to_be16(0x0012)
+#define CM_REP_ATTR_ID cpu_to_be16(0x0013)
+#define CM_RTU_ATTR_ID cpu_to_be16(0x0014)
+#define CM_DREQ_ATTR_IDcpu_to_be16(0x0015)
+#define CM_DREP_ATTR_IDcpu_to_be16(0x0016)
+#define CM_SIDR_REQ_ATTR_IDcpu_to_be16(0x0017)
+#define CM_SIDR_REP_ATTR_IDcpu_to_be16(0x0018)
+#define CM_LAP_ATTR_ID cpu_to_be16(0x0019)
+#define CM_APR_ATTR_ID cpu_to_be16(0x001A)
+
 /**
  * ib_cm_handler - User-defined callback to process communication events.
  * @cm_id: Communication identifier associated with the reported event.
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V1 25/29] net/mlx4_core: Adjustments to SET_PORT for SRIOV-IB

2012-06-19 Thread Jack Morgenstein
1. Slave may not set the IS_SM capability for the port.
2. No DEV_MGR in multifunc mode.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/port.c |   10 ++
 include/linux/mlx4/device.h   |5 +
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c 
b/drivers/net/ethernet/mellanox/mlx4/port.c
index 90dc475..4194859 100644
--- a/drivers/net/ethernet/mellanox/mlx4/port.c
+++ b/drivers/net/ethernet/mellanox/mlx4/port.c
@@ -683,6 +683,16 @@ static int mlx4_common_set_port(struct mlx4_dev *dev, int 
slave, u32 in_mod,
new_cap_mask = ((__be32 *) inbox->buf)[1];
}
 
+   /* slave may not set the IS_SM capability for the port */
+   if (slave != mlx4_master_func_num(dev) &&
+   (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_IS_SM))
+   return -EINVAL;
+
+   /* NO DEV_MGR in multifunc mode */
+   if (mlx4_is_mfunc(dev) &&
+   (be32_to_cpu(new_cap_mask) & MLX4_PORT_CAP_DEV_MGMT_SUP))
+   return -EINVAL;
+
agg_cap_mask = 0;
slave_cap_mask =
priv->mfunc.master.slave_state[slave].ib_cap_mask[port];
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 05846a3..0db9946 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -53,6 +53,11 @@ enum {
 };
 
 enum {
+   MLX4_PORT_CAP_IS_SM = 1 << 1,
+   MLX4_PORT_CAP_DEV_MGMT_SUP = 1 << 19,
+};
+
+enum {
MLX4_MAX_PORTS  = 2,
MLX4_MAX_PORT_PKEYS = 128
 };
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V1 28/29] {NET,IB}/mlx4: Activate SRIOV mode for IB

2012-06-19 Thread Jack Morgenstein
Remove the error returns for IB ports from mlx4_ib_add,
mlx4_INIT_PORT_wrapper, and mlx4_CLOSE_PORT_wrapper.

Currently, SRIOV is supported only for devices for which the
link-layer is IB on all ports; RoCE support will be implemented at a later time.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/main.c   |8 ++--
 drivers/net/ethernet/mellanox/mlx4/fw.c |6 --
 include/linux/mlx4/device.h |4 
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index f072572..61b0db9 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1243,11 +1243,15 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 
pr_info_once("%s", mlx4_ib_version);
 
-   if (mlx4_is_mfunc(dev)) {
-   pr_warn("IB not yet supported in SRIOV\n");
+   mlx4_foreach_non_ib_transport_port(i, dev)
+   num_ports++;
+
+   if (mlx4_is_mfunc(dev) && num_ports) {
+   dev_err(&dev->pdev->dev, "RoCE is not supported over SRIOV as 
yet\n");
return NULL;
}
 
+   num_ports = 0;
mlx4_foreach_ib_transport_port(i, dev)
num_ports++;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c 
b/drivers/net/ethernet/mellanox/mlx4/fw.c
index d72595f..3a73891 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -1342,9 +1342,6 @@ int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
if (priv->mfunc.master.slave_state[slave].init_port_mask & (1 << port))
return 0;
 
-   if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
-   return -ENODEV;
-
if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
/* Enable port only if it was previously disabled */
if (!priv->mfunc.master.init_port_ref[port]) {
@@ -1440,9 +1437,6 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
(1 << port)))
return 0;
 
-   if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
-   return -ENODEV;
-
if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
if (priv->mfunc.master.init_port_ref[port] == 1) {
err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 0db9946..c853ef5 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -662,6 +662,10 @@ struct mlx4_init_port_param {
for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \
if ((type) == (dev)->caps.port_mask[(port)])
 
+#define mlx4_foreach_non_ib_transport_port(port, dev) \
+   for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)   \
+   if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB))
+
 #define mlx4_foreach_ib_transport_port(port, dev) \
for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++)   \
if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH for-next V1 29/29] {NET,IB}/mlx4: Paravirtualize Node Guids for slaves.

2012-06-19 Thread Jack Morgenstein
This is necessary in order to support more than one VF/PF on a VM for software
that uses the node guid as a discriminator, such as librdmacm.

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/mad.c  |   14 ++
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |3 +++
 drivers/net/ethernet/mellanox/mlx4/cmd.c  |   11 +++
 drivers/net/ethernet/mellanox/mlx4/main.c |   22 ++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |1 +
 include/linux/mlx4/device.h   |2 ++
 6 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 6a485d0..e01309b 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -89,6 +90,12 @@ static void handle_lid_change_event(struct mlx4_ib_dev *dev, 
u8 port_num);
 static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
int block, u32 change_bitmap);
 
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI   ((u64) (((u64)IB_OPENIB_OUI) << 40))
+   return (cpu_to_be64( NODE_GUID_HI | random32()));
+}
+
 __be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
 {
return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
@@ -1959,6 +1966,13 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
return 0;
}
 
+   for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+   if (i == mlx4_master_func_num(dev->dev))
+   mlx4_put_slave_node_guid(dev->dev, i, 
dev->ib_dev.node_guid);
+   else
+   mlx4_put_slave_node_guid(dev->dev, i, 
mlx4_ib_gen_node_guid());
+   }
+
err = mlx4_ib_init_alias_guid_service(dev);
if (err) {
mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 5e5fc70..68b1cd5 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -722,4 +722,7 @@ int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev 
*device) ;
 
 void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
 
+__be64 mlx4_ib_gen_node_guid(void);
+
+
 #endif /* MLX4_IB_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 0629341..621bcd0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -651,6 +651,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
struct ib_smp *outsmp = outbox->buf;
__be16 *outtab = (__be16 *)(outsmp->data);
__be32 slave_cap_mask;
+   __be64 slave_node_guid;
port = vhcr->in_modifier;
 
if (smp->base_version == 1 &&
@@ -710,6 +711,16 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
}
return err;
}
+   if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) {
+   err = mlx4_cmd_box(dev, inbox->dma, outbox->dma,
+vhcr->in_modifier, 
vhcr->op_modifier,
+vhcr->op, MLX4_CMD_TIME_CLASS_C, 
MLX4_CMD_NATIVE);
+   if (!err) {
+   slave_node_guid =  
mlx4_get_slave_node_guid(dev, slave);
+   memcpy(outsmp->data + 12, 
&slave_node_guid, 8);
+   }
+   return err;
+   }
}
}
if (slave != mlx4_master_func_num(dev) &&
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index c54003b..398c3f4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -414,6 +414,28 @@ void mlx4_sync_pkey_table(struct mlx4_dev *dev, int slave, 
int port, int i, int
 }
 EXPORT_SYMBOL(mlx4_sync_pkey_table);
 
+void mlx4_put_slave_node_guid(struct mlx4_dev *dev, int slave, __be64 guid)
+{
+   struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+   if (!mlx4_is_master(dev))
+   return;
+
+   priv->slave_node_guids[slave] = guid;
+}
+EXPORT_SYMBOL(mlx4_put_slave_node_guid);
+
+__be64 mlx4_get_slave_node_guid(struct mlx4_dev *dev, int slave)
+{
+   struct mlx4_priv *priv = container_of(dev, struct mlx4_priv, dev);
+
+   if (!mlx4_is_master(dev))
+   return 0;
+
+   return priv->slave_node_guids[slave];
+}
+EXPORT_SYMBOL(mlx4_get_slave_node_guid);
+
 int mlx4_is_slave_active

[PATCH for-next V1 26/29] net/mlx4_core: INIT/CLOSE port logic for IB ports in SRIOV mode

2012-06-19 Thread Jack Morgenstein
Normally, INIT_PORT and CLOSE_PORT are invoked when special QP0
transitions to RTR, or transitions to ERR/RESET respectively.

In SRIOV mode, however, the master is also paravirtualized.
This in turn requires that we not do INIT_PORT until the
entire QP0 path (real QP0 and proxy QP0) is ready to receive.
When the real QP0 goes down, we indicate that the port is not active.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/fw.c |   71 +--
 drivers/net/ethernet/mellanox/mlx4/qp.c |   38 +++--
 2 files changed, 92 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c 
b/drivers/net/ethernet/mellanox/mlx4/fw.c
index 5d19750..d72595f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -1316,6 +1316,19 @@ out:
return err;
 }
 
+/* for IB-type ports only in SRIOV mode. Checks that both proxy QP0
+ * and real QP0 are active, so that the paravirtualized QP0 is ready
+ * to operate */
+static int check_qp0_state(struct mlx4_dev *dev, int function, int port)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   /* irrelevant if not infiniband */
+   if (priv->mfunc.master.qp0_state[port].proxy_qp0_active &&
+   priv->mfunc.master.qp0_state[port].qp0_active)
+   return 1;
+   return 0;
+}
+
 int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int slave,
   struct mlx4_vhcr *vhcr,
   struct mlx4_cmd_mailbox *inbox,
@@ -1332,14 +1345,29 @@ int mlx4_INIT_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
return -ENODEV;
 
-   /* Enable port only if it was previously disabled */
-   if (!priv->mfunc.master.init_port_ref[port]) {
-   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
-  MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
-   if (err)
-   return err;
+   if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+   /* Enable port only if it was previously disabled */
+   if (!priv->mfunc.master.init_port_ref[port]) {
+   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_INIT_PORT,
+  MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+   if (err)
+   return err;
+   }
+   priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << 
port);
+   } else {
+   if (slave == mlx4_master_func_num(dev)) {
+   if (check_qp0_state(dev, slave, port) &&
+   !priv->mfunc.master.qp0_state[port].port_active) {
+   err = mlx4_cmd(dev, 0, port, 0, 
MLX4_CMD_INIT_PORT,
+  MLX4_CMD_TIME_CLASS_A, 
MLX4_CMD_NATIVE);
+   if (err)
+   return err;
+   priv->mfunc.master.qp0_state[port].port_active 
= 1;
+   
priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
+   }
+   } else
+   priv->mfunc.master.slave_state[slave].init_port_mask |= 
(1 << port);
}
-   priv->mfunc.master.slave_state[slave].init_port_mask |= (1 << port);
++priv->mfunc.master.init_port_ref[port];
return 0;
 }
@@ -1414,13 +1442,30 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int 
slave,
 
if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_IB)
return -ENODEV;
-   if (priv->mfunc.master.init_port_ref[port] == 1) {
-   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000,
-  MLX4_CMD_NATIVE);
-   if (err)
-   return err;
+
+   if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) {
+   if (priv->mfunc.master.init_port_ref[port] == 1) {
+   err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT,
+  1000, MLX4_CMD_NATIVE);
+   if (err)
+   return err;
+   }
+   priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << 
port);
+   } else {
+   /* infiniband port */
+   if (slave == mlx4_master_func_num(dev)) {
+   if (!priv->mfunc.master.qp0_state[port].qp0_active &&
+   priv->mfunc.master.qp0_state[port].port_active) {
+   err = mlx4_cmd(dev, 0, port, 0, 
MLX4_CMD_CLOSE_PORT,
+  1000, MLX4_CMD_NATIVE);
+   

[PATCH for-next V1 22/29] {NET,IB}/mlx4: Add alias_guid mechanism

2012-06-19 Thread Jack Morgenstein
For IB ports, we paravirtualize the GUID at index 0 on slaves.
The GUID at index 0 seen by a slave is the actual GUID occupying
the GUID table at the slave-id index.

The driver, by default, requests at startup time that the subnet manager
populate its entire guid table with GUIDs. These guids are then
mapped (paravirtualized) to the slaves, and appear for each slave
as its GUID at index 0.

Until each slave has such a guid, its port status is DOWN.

The guid table is cached to support SQP paravirtualization, and event
propagation to slaves on guid change (we test to see if the guid really
changed before propagating an event to the slave).

To support this caching, added capability to __mlx4_ib_query_gid()
to obtain the network view (i.e., physical view) gid at index X,
not just the host (paravirtualized) view.

Based on a patch from Erez Shitrit 

Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile  |2 +-
 drivers/infiniband/hw/mlx4/alias_GUID.c  |  688 ++
 drivers/infiniband/hw/mlx4/mad.c |   19 +-
 drivers/infiniband/hw/mlx4/main.c|   37 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   74 
 drivers/net/ethernet/mellanox/mlx4/cmd.c |6 +-
 6 files changed, 816 insertions(+), 10 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/alias_GUID.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index bf0aa90..31d4c8a 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o 
alias_GUID.o
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c 
b/drivers/infiniband/hw/mlx4/alias_GUID.c
new file mode 100644
index 000..9db1581
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+ /***/
+/* This file supports the handling of the Alias GUID feature. */
+/***/
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "mlx4_ib.h"
+
+/*
+The driver keeps the current state of all guids, as they are in the HW.
+Whenever we receive an smp mad GUIDInfo record, the data will be cached.
+*/
+
+struct mlx4_alias_guid_work_context {
+   u8 port;
+   struct mlx4_ib_dev *dev ;
+   struct ib_sa_query *sa_query;
+   struct completion   done;
+   int query_id;
+   struct list_headlist;
+   int block_num;
+};
+
+struct mlx4_next_alias_guid_work {
+   u8 port;
+   u8 block_num;
+   struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int 
block_num,
+u8 port_num, u8 *p_data)
+{
+   int i;
+   u64 guid_indexes;
+   int slave_id;
+   int port_index = port_num - 1;
+
+   if (!mlx4_is_master(dev->dev))
+   return;
+
+   guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+  ports_guid[port_num - 1].
+  all_rec_per_port[block_num].guid_indexes);
+   pr_debug("p

[PATCH for-next V1 21/29] net/mlx4_core: Add IB port-state machine, and port mgmt event propagation infrastructure

2012-06-19 Thread Jack Morgenstein
For an IB port, a slave should not show port active until that
slave has a valid alias-guid (provided by the subnet manager).
Therefore the port-up event should be passed to a slave only after
both the port is up, and the slave's alias-guid has been set.

Also, provide the infrastructure for propagating port-management
events (client-reregister, etc) to slaves.

Signed-off-by: Jack Morgenstein 
---
 drivers/net/ethernet/mellanox/mlx4/eq.c   |  237 +++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |1 +
 include/linux/mlx4/device.h   |   28 
 3 files changed, 250 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c 
b/drivers/net/ethernet/mellanox/mlx4/eq.c
index 9b15d02..5d1de71 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -199,6 +199,196 @@ static void mlx4_slave_event(struct mlx4_dev *dev, int 
slave,
slave_event(dev, slave, eqe);
 }
 
+int mlx4_gen_pkey_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+   struct mlx4_eqe eqe;
+
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   struct mlx4_slave_state *s_slave = 
&priv->mfunc.master.slave_state[slave];
+
+   if (!s_slave->active)
+   return 0;
+
+   memset(&eqe, 0, sizeof eqe);
+
+   eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+   eqe.subtype = MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE;
+   eqe.event.port_mgmt_change.port = port;
+
+   return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_pkey_eqe);
+
+int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port)
+{
+   struct mlx4_eqe eqe;
+
+   /* don't send if we don't have that slave */
+   if (dev->num_vfs < slave)
+   return 0;
+   memset(&eqe, 0, sizeof eqe);
+
+   eqe.type = MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT;
+   eqe.subtype = MLX4_DEV_PMC_SUBTYPE_GUID_INFO;
+   eqe.event.port_mgmt_change.port = port;
+
+   return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_guid_change_eqe);
+
+int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port,
+  u8 port_subtype_change)
+{
+   struct mlx4_eqe eqe;
+
+   /* don't send if we don't have that slave */
+   if (dev->num_vfs < slave)
+   return 0;
+   memset(&eqe, 0, sizeof eqe);
+
+   eqe.type = MLX4_EVENT_TYPE_PORT_CHANGE;
+   eqe.subtype = port_subtype_change;
+   eqe.event.port_change.port = cpu_to_be32(port << 28);
+
+   mlx4_dbg(dev, "%s: sending: %d to slave: %d on port: %d\n", __func__,
+port_subtype_change, slave, port);
+   return mlx4_GEN_EQE(dev, slave, &eqe);
+}
+EXPORT_SYMBOL(mlx4_gen_port_state_change_eqe);
+
+enum slave_port_state mlx4_get_slave_port_state(struct mlx4_dev *dev, int 
slave, u8 port)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+   if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS) {
+   pr_err("%s: Error: asking for slave:%d, port:%d\n",
+  __func__, slave, port);
+   return SLAVE_PORT_DOWN;
+   }
+   return s_state[slave].port_state[port];
+}
+EXPORT_SYMBOL(mlx4_get_slave_port_state);
+
+static int mlx4_set_slave_port_state(struct mlx4_dev *dev, int slave, u8 port,
+enum slave_port_state state)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+   struct mlx4_slave_state *s_state = priv->mfunc.master.slave_state;
+
+   if (slave >= dev->num_slaves || port > MLX4_MAX_PORTS || port == 0) {
+   pr_err("%s: Error: asking for slave:%d, port:%d\n",
+  __func__, slave, port);
+   return -1;
+   }
+   s_state[slave].port_state[port] = state;
+
+   return 0;
+}
+
+static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event)
+{
+   int i;
+   enum slave_port_gen_event gen_event;
+
+   for (i = 0; i < dev->num_slaves; i++)
+   set_and_calc_slave_port_state(dev, i, port, event, &gen_event);
+}
+/**
+   The function gets as input the new event for that port,
+   and, according to the previous state, changes the slave's port state.
+   The events are:
+   MLX4_PORT_STATE_DEV_EVENT_PORT_DOWN,
+   MLX4_PORT_STATE_DEV_EVENT_PORT_UP
+   MLX4_PORT_STATE_IB_EVENT_GID_VALID
+   MLX4_PORT_STATE_IB_EVENT_GID_INVALID
+***/
+int set_and_calc_slave_port_state(struct mlx4_dev *dev, int slave,
+ u8 port, int event,
+

[PATCH for-next V1 20/29] IB/mlx4: Add CM paravirtualization

2012-06-19 Thread Jack Morgenstein
From: Amir Vadai 

In CM para-virtualization:
1. Incoming requests are steered to the correct vHCA according to the embedded 
GID
2. Communication IDs on outgoing requests are replaced by a globally unique ID, 
generated
   by the PPF, since there is no synchronization of ID generation between 
guests (and so these
   IDs are not guaranteed to be globally unique).
   The guest's comm ID is stored, and is returned to the response MAD when it 
arrives.

Signed-off-by: Amir Vadai 
Signed-off-by: Jack Morgenstein 
---
 drivers/infiniband/hw/mlx4/Makefile  |2 +-
 drivers/infiniband/hw/mlx4/cm.c  |  437 ++
 drivers/infiniband/hw/mlx4/mad.c |   16 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   15 ++
 4 files changed, 468 insertions(+), 2 deletions(-)
 create mode 100644 drivers/infiniband/hw/mlx4/cm.c

diff --git a/drivers/infiniband/hw/mlx4/Makefile 
b/drivers/infiniband/hw/mlx4/Makefile
index 20d627d..bf0aa90 100644
--- a/drivers/infiniband/hw/mlx4/Makefile
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4_ib.o
 
-mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o
+mlx4_ib-y :=   ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
new file mode 100644
index 000..0a52146
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT  (5 * HZ)
+
+struct id_map_entry {
+   struct rb_node node;
+
+   u32 sl_cm_id;
+   u32 pv_cm_id;
+   int slave_id;
+   int scheduled_delete;
+   struct mlx4_ib_dev *dev;
+
+   struct list_head list;
+   struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+   struct ib_mad_hdr hdr;
+
+   __be32 local_comm_id;
+   __be32 remote_comm_id;
+};
+
+struct cm_req_msg {
+   unsigned char unused[0x60];
+   union ib_gid primary_path_sgid;
+};
+
+
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+   msg->local_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+   return be32_to_cpu(msg->local_comm_id);
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+   msg->remote_comm_id = cpu_to_be32(cm_id);
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+   struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+
+   return be32_to_cpu(msg->remote_comm_id);
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad 
*mad)
+{
+   struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+   return msg->primary_path_sgid;
+}
+
+/* Lock should be taken before called */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+   struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+   struct rb_node *node = sl_id_map->rb_node;
+
+   while (node) {
+   struct id_map_entry *id_map_entry =
+   rb_entry(node, struct id_map_entry, node);
+
+   if (id_map_entry->sl_cm_id > sl_cm_id)
+ 

  1   2   3   >