Re: [PATCH] IB/IPoIB: Fix kernel panic on multicast flow

2016-01-07 Thread Erez Shitrit
On Thu, Jan 7, 2016 at 11:23 AM, Yuval Shaia <yuval.sh...@oracle.com> wrote:
> On Thu, Jan 07, 2016 at 09:28:08AM +0200, Erez Shitrit wrote:
>>
>> ipoib_mcast_restart_task calls ipoib_mcast_remove_list with the
>> parameter mcast->dev. That mcast is a temporary (used as an iterator)
>> variable that may be uninitialized.
>> There is no need to send the variable dev to the function, as each mcast
> s/send/pass
>> has its dev as a member in the mcast struct.
>>
>> This causes the next panic:
>> RIP: 0010: ipoib_mcast_leave+0x6d/0xf0 [ib_ipoib]
>> RSP: 0018: EFLAGS: 00010246
>> RAX: f0201 RBX: 24e00 RCX: 0
>> 
>> 
>> Stack:
>> Call Trace:
>>   ipoib_mcast_remove_list+0x3a/0x70 [ib_ipoib]
>>   ipoib_mcast_restart_task+0x3bb/0x520 [ib_ipoib]
>>   process_one_work+0x164/0x470
>>   worker_thread+0x11d/0x420
>>   ...
>>
>> Fixes: 5a0e81f6f483 ('IB/IPoIB: factor out common multicast list removal 
>> code')
>> Signed-off-by: Erez Shitrit <ere...@mellanox.com>
>> Reported-by: Doron Tsur <dor...@mellanox.com>
>> ---
>>  drivers/infiniband/ulp/ipoib/ipoib.h   | 2 +-
>>  drivers/infiniband/ulp/ipoib/ipoib_main.c  | 3 +--
>>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 8 
>>  3 files changed, 6 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
>> b/drivers/infiniband/ulp/ipoib/ipoib.h
>> index a924933..a6f3eab 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>> @@ -548,7 +548,7 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter,
>>
>>  int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
>>  union ib_gid *mgid, int set_qkey);
>> -void ipoib_mcast_remove_list(struct net_device *dev, struct list_head 
>> *remove_list);
>> +void ipoib_mcast_remove_list(struct list_head *remove_list);
>>  void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 
>> *mgid,
>>   struct list_head *remove_list);
>>
>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
>> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>> index 0a93cb2..25509bb 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
>> @@ -1150,7 +1150,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv 
>> *priv)
>>   unsigned long flags;
>>   int i;
>>   LIST_HEAD(remove_list);
>> - struct net_device *dev = priv->dev;
> Isn't this one needed later? (in call to __ipoib_mcast_find made in commit
> bd99b2e05c)

There is a later commit (432c55ff) that already takes that code.

>>
>>   if (test_bit(IPOIB_STOP_NEIGH_GC, >flags))
>>   return;
>> @@ -1196,7 +1195,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv 
>> *priv)
>>
>>  out_unlock:
>>   spin_unlock_irqrestore(>lock, flags);
>> - ipoib_mcast_remove_list(dev, _list);
>> + ipoib_mcast_remove_list(_list);
> I'm having difficulties applying this patch on linux-stable.git 4.4-rc6 and
> linux-next 4.4-rc8.

I sent it against ib-next

>>  }
>>
>>  static void ipoib_reap_neigh(struct work_struct *work)
>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
>> b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
>> index ab79b87..050dfa1 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
>> @@ -723,12 +723,12 @@ void ipoib_check_and_add_mcast_sendonly(struct 
>> ipoib_dev_priv *priv, u8 *mgid,
>>   }
>>  }
>>
>> -void ipoib_mcast_remove_list(struct net_device *dev, struct list_head 
>> *remove_list)
>> +void ipoib_mcast_remove_list(struct list_head *remove_list)
>>  {
>>   struct ipoib_mcast *mcast, *tmcast;
>>
>>   list_for_each_entry_safe(mcast, tmcast, remove_list, list) {
>> - ipoib_mcast_leave(dev, mcast);
>> + ipoib_mcast_leave(mcast->dev, mcast);
>>   ipoib_mcast_free(mcast);
>>   }
>>  }
>> @@ -839,7 +839,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
>>   if (test_bit(IPOIB_MCAST_FLAG_BUSY, >flags))
>>   wait_for_completion(>done);
>>
>> - ipoib_mcast_remove_list(dev, _list);
>> + ipoib_mcast_remove_list(_list);
>>  }
>>
>>  static int ipoib_mcast_addr_is_valid(const u8 *a

Re: [PATCH] IB/cm: Fix a recently introduced deadlock

2016-01-06 Thread Erez Shitrit
On Fri, Jan 1, 2016 at 2:17 PM, Bart Van Assche
<bart.vanass...@sandisk.com> wrote:
> ib_send_cm_drep() calls cm_enter_timewait() while holding a spinlock
> that can be locked from inside an interrupt handler. Hence do not
> enable interrupts inside cm_enter_timewait() if called with interrupts
> disabled.
>
> This patch fixes e.g. the following deadlock:
>
> =
> [ INFO: inconsistent lock state ]
> 4.4.0-rc7+ #1 Tainted: GE
> -
> inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
> swapper/8/0 [HC1[1]:SC0[0]:HE0:SE1] takes:
> (&(_id_priv->lock)->rlock){?.+...}, at: [] 
> cm_establish+0x
> 74/0x1b0 [ib_cm]
> {HARDIRQ-ON-W} state was registered at:
>   [] mark_held_locks+0x71/0x90
>   [] trace_hardirqs_on_caller+0xa7/0x1c0
>   [] trace_hardirqs_on+0xd/0x10
>   [] _raw_spin_unlock_irq+0x2b/0x40
>   [] cm_enter_timewait+0xae/0x100 [ib_cm]
>   [] ib_send_cm_drep+0xb6/0x190 [ib_cm]
>   [] srp_cm_handler+0x128/0x1a0 [ib_srp]
>   [] cm_process_work+0x20/0xf0 [ib_cm]
>   [] cm_dreq_handler+0x135/0x2c0 [ib_cm]
>   [] cm_work_handler+0x75/0xd0 [ib_cm]
>   [] process_one_work+0x1bd/0x460
>   [] worker_thread+0x118/0x420
>   [] kthread+0xe4/0x100
>   [] ret_from_fork+0x3f/0x70
> irq event stamp: 1672286
> hardirqs last  enabled at (1672283): [] poll_idle+0x10/0x80
> hardirqs last disabled at (1672284): [] 
> common_interrupt+0x84/0x89
> softirqs last  enabled at (1672286): [] 
> _local_bh_enable+0x1c/0x50
> softirqs last disabled at (1672285): [] irq_enter+0x47/0x70
>
> other info that might help us debug this:
>  Possible unsafe locking scenario:
>
>CPU0
>
>   lock(&(_id_priv->lock)->rlock);
>   
> lock(&(_id_priv->lock)->rlock);
>
>  *** DEADLOCK ***
>
> no locks held by swapper/8/0.
>
> stack backtrace:
> CPU: 8 PID: 0 Comm: swapper/8 Tainted: GE   4.4.0-rc7+ #1
> Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 11/17/2014
>  88045af5e950 88046e503a88 81251c1b 0007
>  0006 0003 88045af5ddc0 88046e503ad8
>  810a32f4   0001
> Call Trace:
>[] dump_stack+0x4f/0x74
>  [] print_usage_bug+0x184/0x190
>  [] mark_lock_irq+0xf2/0x290
>  [] mark_lock+0x115/0x1b0
>  [] mark_irqflags+0x15c/0x170
>  [] __lock_acquire+0x1ef/0x560
>  [] lock_acquire+0x62/0x80
>  [] _raw_spin_lock_irqsave+0x43/0x60
>  [] cm_establish+0x74/0x1b0 [ib_cm]
>  [] ib_cm_notify+0x31/0x100 [ib_cm]
>  [] srpt_qp_event+0x54/0xd0 [ib_srpt]
>  [] mlx4_ib_qp_event+0x72/0xc0 [mlx4_ib]
>  [] mlx4_qp_event+0x69/0xd0 [mlx4_core]
>  [] mlx4_eq_int+0x51e/0xd50 [mlx4_core]
>  [] mlx4_msi_x_interrupt+0xf/0x20 [mlx4_core]
>  [] handle_irq_event_percpu+0x40/0x110
>  [] handle_irq_event+0x3f/0x70
>  [] handle_edge_irq+0x79/0x120
>  [] handle_irq+0x5d/0x130
>  [] do_IRQ+0x6d/0x130
>  [] common_interrupt+0x89/0x89
>[] cpuidle_enter_state+0xcf/0x200
>  [] cpuidle_enter+0x12/0x20
>  [] call_cpuidle+0x36/0x60
>  [] cpuidle_idle_call+0x63/0x110
>  [] cpu_idle_loop+0xfa/0x130
>  [] cpu_startup_entry+0xe/0x10
>  [] start_secondary+0x83/0x90
>
> Fixes: commit be4b499323bf ("IB/cm: Do not queue work to a device that's 
> going away")
> Signed-off-by: Bart Van Assche <bart.vanass...@sandisk.com>

Acked-by: Erez Shitrit <ere...@mellanox.com>

> Cc: Erez Shitrit <ere...@mellanox.com>
> Cc: stable <sta...@vger.kernel.org>
> ---
>  drivers/infiniband/core/cm.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
> index 0a26dd6..d6d2b35 100644
> --- a/drivers/infiniband/core/cm.c
> +++ b/drivers/infiniband/core/cm.c
> @@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private 
> *cm_id_priv)
> wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
>
> /* Check if the device started its remove_one */
> -   spin_lock_irq();
> +   spin_lock_irqsave(, flags);
> if (!cm_dev->going_down)
> queue_delayed_work(cm.wq, 
> _id_priv->timewait_info->work.work,
>msecs_to_jiffies(wait_time));
> -   spin_unlock_irq();
> +   spin_unlock_irqrestore(, flags);
>
> cm_id_priv->timewait_info = NULL;
>  }
> --
> 2.1.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/ipoib: Change sendq threshold size

2015-11-16 Thread Erez Shitrit
On Mon, Nov 16, 2015 at 8:54 PM, Yuval Shaia  wrote:
> Expecting half of the queue to be empty before reopening netif_queue seems
> too high.
> With this fix threshold will be 90%.

Is this backed by tests? why not 75% or 25%?

The original reason for keeping the queue closed until half of the
completions returned was that the HW in some cases doesn't keep
pace with a heavy TX flow, so it was good to stop the kernel from
sending until the HW drained all the waiting completions.
If the threshold is high (90%) we will see many close/open cycles of
the queue under a long flow of traffic, which can lead to more back
pressure on the kernel and to lower bandwidth or higher latency in the end.

(If you have tests that show the opposite, please share.)

>
> Suggested-By: Ajaykumar Hotchandani 
> Signed-off-by: Yuval Shaia 
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h  |4 
>  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |8 
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |4 ++--
>  drivers/infiniband/ulp/ipoib/ipoib_main.c |5 +
>  4 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
> b/drivers/infiniband/ulp/ipoib/ipoib.h
> index edc5b85..9dd97ac 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -115,6 +115,9 @@ enum {
> IPOIB_RTNL_CHILD  = 2,
>  };
>
> +/* Wake queue in case tx_outstanding go below 90% of sendq size */
> +#define IPOIB_TX_RING_LOW_TH_FACTOR90
> +
>  #defineIPOIB_OP_RECV   (1ul << 31)
>  #ifdef CONFIG_INFINIBAND_IPOIB_CM
>  #defineIPOIB_OP_CM (1ul << 30)
> @@ -767,6 +770,7 @@ static inline void ipoib_unregister_debugfs(void) { }
> ipoib_printk(KERN_WARNING, priv, format , ## arg)
>
>  extern int ipoib_sendq_size;
> +extern int ipoib_sendq_low_th;
>  extern int ipoib_recvq_size;
>
>  extern struct ib_sa_client ipoib_sa_client;
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index c78dc16..446f394 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -796,9 +796,9 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct 
> ib_wc *wc)
> netif_tx_lock(dev);
>
> ++tx->tx_tail;
> -   if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> +   if (unlikely(--priv->tx_outstanding == ipoib_sendq_low_th) &&
> netif_queue_stopped(dev) &&
> -   test_bit(IPOIB_FLAG_ADMIN_UP, >flags))
> +   likely(test_bit(IPOIB_FLAG_ADMIN_UP, >flags)))
> netif_wake_queue(dev);
>
> if (wc->status != IB_WC_SUCCESS &&
> @@ -1198,9 +1198,9 @@ timeout:
> dev_kfree_skb_any(tx_req->skb);
> ++p->tx_tail;
> netif_tx_lock_bh(p->dev);
> -   if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) 
> &&
> +   if (unlikely(--priv->tx_outstanding == ipoib_sendq_low_th) &&
> netif_queue_stopped(p->dev) &&
> -   test_bit(IPOIB_FLAG_ADMIN_UP, >flags))
> +   likely(test_bit(IPOIB_FLAG_ADMIN_UP, >flags)))
> netif_wake_queue(p->dev);
> netif_tx_unlock_bh(p->dev);
> }
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index d27..6f12009 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -397,9 +397,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, 
> struct ib_wc *wc)
> dev_kfree_skb_any(tx_req->skb);
>
> ++priv->tx_tail;
> -   if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
> +   if (unlikely(--priv->tx_outstanding == ipoib_sendq_low_th) &&
> netif_queue_stopped(dev) &&
> -   test_bit(IPOIB_FLAG_ADMIN_UP, >flags))
> +   likely(test_bit(IPOIB_FLAG_ADMIN_UP, >flags)))
> netif_wake_queue(dev);
>
> if (wc->status != IB_WC_SUCCESS &&
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index babba05..8f2f8fc 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -62,6 +62,7 @@ MODULE_LICENSE("Dual BSD/GPL");
>  MODULE_VERSION(DRV_VERSION);
>
>  int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
> +int ipoib_sendq_low_th __read_mostly;
>  int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
>
>  module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
> @@ -2001,6 +2002,10 @@ static int __init ipoib_init_module(void)
> ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
> ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
> 

Re: [PATCH] IB/ipoib: optimized the function ipoib_mcast_alloc

2015-11-02 Thread Erez Shitrit
On Sun, Nov 1, 2015 at 8:23 AM, Saurabh Sengar <saurabh.tr...@gmail.com> wrote:
> ipoib_mcast_alloc is called only in atomic context,
> hence removing the extra check.
>
> Signed-off-by: Saurabh Sengar <saurabh.tr...@gmail.com>

Acked-by: Erez Shitrit <ere...@mellanox.com>

> ---
> Hi,
> Even if in future, if there are some functions expected to call it in normal 
> context(not atomic),
> we can pass the GFP_KERNEL or GFP_ATOMIC directly to function call instead of 
> passing 0 and 1,
> which later again need to be compared in order to change it to GFP_KERNEL and 
> GFP_ATOMIC.
> Please let me know if there are better ways to improve it.
>
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index d750a86..15d35be 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -132,12 +132,11 @@ void ipoib_mcast_free(struct ipoib_mcast *mcast)
> kfree(mcast);
>  }
>
> -static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
> -int can_sleep)
> +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev)
>  {
> struct ipoib_mcast *mcast;
>
> -   mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
> +   mcast = kzalloc(sizeof *mcast, GFP_ATOMIC);
> if (!mcast)
> return NULL;
>
> @@ -573,7 +572,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
> if (!priv->broadcast) {
> struct ipoib_mcast *broadcast;
>
> -   broadcast = ipoib_mcast_alloc(dev, 0);
> +   broadcast = ipoib_mcast_alloc(dev);
> if (!broadcast) {
> ipoib_warn(priv, "failed to allocate broadcast 
> group\n");
> /*
> @@ -728,7 +727,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, 
> struct sk_buff *skb)
> ipoib_dbg_mcast(priv, "setting up send only multicast 
> group for %pI6\n",
> mgid);
>
> -   mcast = ipoib_mcast_alloc(dev, 0);
> +   mcast = ipoib_mcast_alloc(dev);
> if (!mcast) {
> ipoib_warn(priv, "unable to allocate memory "
>"for multicast structure\n");
> @@ -886,7 +885,7 @@ void ipoib_mcast_restart_task(struct work_struct *work)
> ipoib_dbg_mcast(priv, "adding multicast entry for 
> mgid %pI6\n",
> mgid.raw);
>
> -   nmcast = ipoib_mcast_alloc(dev, 0);
> +   nmcast = ipoib_mcast_alloc(dev);
> if (!nmcast) {
> ipoib_warn(priv, "unable to allocate memory 
> for multicast structure\n");
> continue;
> --
> 1.9.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/cm: Do not queue a work when the device is going to be removed

2015-06-26 Thread Erez Shitrit
On Thu, Jun 25, 2015 at 6:51 PM, Bart Van Assche
bart.vanass...@sandisk.com wrote:
 On 06/25/2015 07:13 AM, Erez Shitrit wrote:

 @@ -3864,14 +3904,23 @@ static void cm_remove_one(struct ib_device
 *ib_device)
 list_del(cm_dev-list);
 write_unlock_irqrestore(cm.device_lock, flags);

 +   spin_lock_irq(cm.lock);
 +   cm_dev-going_down = 1;
 +   spin_unlock_irq(cm.lock);
 +
 for (i = 1; i = ib_device-phys_port_cnt; i++) {
 if (!rdma_cap_ib_cm(ib_device, i))
 continue;

 port = cm_dev-port[i-1];
 ib_modify_port(ib_device, port-port_num, 0,
 port_modify);
 -   ib_unregister_mad_agent(port-mad_agent);
 +   /*
 +* We flush the queue here after the going_down set, this
 +* verify that no new works will be queued in the recv
 handler,
 +* after that we can call the unregister_mad_agent
 +*/
 flush_workqueue(cm.wq);
 +   ib_unregister_mad_agent(port-mad_agent);
 cm_remove_port_fs(port);
 }
 device_unregister(cm_dev-device);


 Hello Erez,

 How about splitting unregister_mad_agent() into two functions, one that
 stops the invocation of the receive callbacks and another one that cancels
 all sends ? If the new function that stops the receive callbacks would be
 invoked before flush_workqueue(), would that be safe ?
No, work items that are still pending in the queue will need the mad_agent;
the best approach is to finish all the pending work and not to let new work
come in.

 Would that allow to
 drop the new flag going_down since the workqueue implementation already
 sets __WQ_DRAINING ?

 Thanks,

 Bart.

 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/cm: Do not queue a work when the device is going to be removed

2015-06-25 Thread Erez Shitrit

Whenever ib_cm gets a remove_one call, such as when there is a hot-unplug
event, the driver should mark itself as going_down and confirm that no
new work items are going to be queued for that device.
So, the order of the actions is:
1. mark the going_down bit.
2. flush the wq.
3. [make sure there are no new work items for that device.]
4. unregister the mad agent.

Otherwise, work items that are already queued can be scheduled after the
mad agent was freed.

Signed-off-by: Erez Shitrit ere...@mellanox.com
---
 drivers/infiniband/core/cm.c | 61 +++-
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index dbd..3a972eb 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -169,6 +169,7 @@ struct cm_device {
struct ib_device *ib_device;
struct device *device;
u8 ack_delay;
+   int going_down;
struct cm_port *port[0];
 };
 
@@ -805,6 +806,11 @@ static void cm_enter_timewait(struct cm_id_private 
*cm_id_priv)
 {
int wait_time;
unsigned long flags;
+   struct cm_device *cm_dev;
+
+   cm_dev = ib_get_client_data(cm_id_priv-id.device, cm_client);
+   if (!cm_dev)
+   return;
 
spin_lock_irqsave(cm.lock, flags);
cm_cleanup_timewait(cm_id_priv-timewait_info);
@@ -818,8 +824,14 @@ static void cm_enter_timewait(struct cm_id_private 
*cm_id_priv)
 */
cm_id_priv-id.state = IB_CM_TIMEWAIT;
wait_time = cm_convert_to_ms(cm_id_priv-av.timeout);
-   queue_delayed_work(cm.wq, cm_id_priv-timewait_info-work.work,
-  msecs_to_jiffies(wait_time));
+
+   /* Check if the device started its remove_one */
+   spin_lock_irq(cm.lock);
+   if (!cm_dev-going_down)
+   queue_delayed_work(cm.wq, cm_id_priv-timewait_info-work.work,
+  msecs_to_jiffies(wait_time));
+   spin_unlock_irq(cm.lock);
+
cm_id_priv-timewait_info = NULL;
 }
 
@@ -3305,6 +3317,11 @@ static int cm_establish(struct ib_cm_id *cm_id)
struct cm_work *work;
unsigned long flags;
int ret = 0;
+   struct cm_device *cm_dev;
+
+   cm_dev = ib_get_client_data(cm_id-device, cm_client);
+   if (!cm_dev)
+   return -ENODEV;
 
work = kmalloc(sizeof *work, GFP_ATOMIC);
if (!work)
@@ -3343,7 +3360,17 @@ static int cm_establish(struct ib_cm_id *cm_id)
work-remote_id = cm_id-remote_id;
work-mad_recv_wc = NULL;
work-cm_event.event = IB_CM_USER_ESTABLISHED;
-   queue_delayed_work(cm.wq, work-work, 0);
+
+   /* Check if the device started its remove_one */
+   spin_lock_irq(cm.lock);
+   if (!cm_dev-going_down) {
+   queue_delayed_work(cm.wq, work-work, 0);
+   } else {
+   kfree(work);
+   ret = -ENODEV;
+   }
+   spin_unlock_irq(cm.lock);
+
 out:
return ret;
 }
@@ -3394,6 +3421,7 @@ static void cm_recv_handler(struct ib_mad_agent 
*mad_agent,
enum ib_cm_event_type event;
u16 attr_id;
int paths = 0;
+   int going_down = 0;
 
switch (mad_recv_wc-recv_buf.mad-mad_hdr.attr_id) {
case CM_REQ_ATTR_ID:
@@ -3452,7 +3480,19 @@ static void cm_recv_handler(struct ib_mad_agent 
*mad_agent,
work-cm_event.event = event;
work-mad_recv_wc = mad_recv_wc;
work-port = port;
-   queue_delayed_work(cm.wq, work-work, 0);
+
+   /* Check if the device started its remove_one */
+   spin_lock_irq(cm.lock);
+   if (!port-cm_dev-going_down)
+   queue_delayed_work(cm.wq, work-work, 0);
+   else
+   going_down = 1;
+   spin_unlock_irq(cm.lock);
+
+   if (going_down) {
+   kfree(work);
+   ib_free_recv_mad(mad_recv_wc);
+   }
 }
 
 static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
@@ -3771,7 +3811,7 @@ static void cm_add_one(struct ib_device *ib_device)
 
cm_dev-ib_device = ib_device;
cm_get_ack_delay(cm_dev);
-
+   cm_dev-going_down = 0;
cm_dev-device = device_create(cm_class, ib_device-dev,
   MKDEV(0, 0), NULL,
   %s, ib_device-name);
@@ -3864,14 +3904,23 @@ static void cm_remove_one(struct ib_device *ib_device)
list_del(cm_dev-list);
write_unlock_irqrestore(cm.device_lock, flags);
 
+   spin_lock_irq(cm.lock);
+   cm_dev-going_down = 1;
+   spin_unlock_irq(cm.lock);
+
for (i = 1; i = ib_device-phys_port_cnt; i++) {
if (!rdma_cap_ib_cm(ib_device, i))
continue;
 
port = cm_dev-port[i-1];
ib_modify_port(ib_device, port-port_num, 0, port_modify);
-   ib_unregister_mad_agent(port-mad_agent);
+   /*
+* We flush the queue here after

Re: [PATCH v3] IB/ipoib: Scatter-Gather support in connected mode

2015-06-10 Thread Erez Shitrit
On Mon, May 11, 2015 at 1:04 PM, Yuval Shaia yuval.sh...@oracle.com wrote:
 By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better performance.
 This MTU plus overhead puts the memory allocation for IP based packets at 32
 4k pages (order 5), which have to be contiguous.
 When the system memory under pressure, it was observed that allocating 128k
 contiguous physical memory is difficult and causes serious errors (such as
 system becomes unusable).

 This enhancement resolve the issue by removing the physically contiguous 
 memory
 requirement using Scatter/Gather feature that exists in Linux stack.

 With this fix Scatter-Gather will be supported also in connected mode.

 This change reverts the some of the change made in commit
 e112373fd6aa280bd2cbc0d5cc3809115325a1be
 (IPoIB/cm: Reduce connected mode TX object size).

 The ability to use SG in IPoIB CM is possible because the coupling
 between NETIF_F_SG and NETIF_F_CSUM was removed in commit
 ec5f061564238892005257c83565a0b58ec79295
 (net: Kill link between CSUM and SG features.)

 Signed-off-by: Yuval Shaia yuval.sh...@oracle.com
 ---
 v2: Enhance commit log message

 I'd like to thank John Sobecki john.sobe...@oracle.com and
 Christian Marie christ...@ponies.io for helping me with reviewing this 
 patch.
 I will be glad if you can ACK it.

 v3: Add an empty line to start a new paragraph and use 12 hex-digits 
 identifier
 ---
  drivers/infiniband/ulp/ipoib/ipoib.h  |4 +-
  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 
 +++--
  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |3 +-
  drivers/infiniband/ulp/ipoib/ipoib_main.c |2 +-
  4 files changed, 92 insertions(+), 24 deletions(-)

 diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
 b/drivers/infiniband/ulp/ipoib/ipoib.h
 index bd94b0a..372c8a2 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib.h
 +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
 @@ -239,7 +239,7 @@ struct ipoib_cm_tx {
 struct net_device   *dev;
 struct ipoib_neigh  *neigh;
 struct ipoib_path   *path;
 -   struct ipoib_cm_tx_buf *tx_ring;
 +   struct ipoib_tx_buf *tx_ring;
 unsigned tx_head;
 unsigned tx_tail;
 unsigned longflags;
 @@ -504,6 +504,8 @@ int ipoib_mcast_stop_thread(struct net_device *dev);
  void ipoib_mcast_dev_down(struct net_device *dev);
  void ipoib_mcast_dev_flush(struct net_device *dev);

 +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
 +
  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
  int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
 b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
 index 56959ad..eae106e 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
 +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
 @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv 
 *priv, int frags,
 ib_dma_unmap_page(priv-ca, mapping[i + 1], PAGE_SIZE, 
 DMA_FROM_DEVICE);
  }

 +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
 + struct ipoib_tx_buf *tx_req)
 +{
 +   struct sk_buff *skb;
 +   int i, offs;
 +
 +   skb = tx_req-skb;
 +   if (skb_shinfo(skb)-nr_frags) {
 +   offs = 0;
 +   if (skb_headlen(skb)) {
 +   ib_dma_unmap_single(priv-ca, tx_req-mapping[0],
 +   skb_headlen(skb), DMA_TO_DEVICE);
 +   offs = 1;
 +   }
 +   for (i = 0; i  skb_shinfo(skb)-nr_frags; ++i) {
 +   const skb_frag_t *frag = skb_shinfo(skb)-frags[i];
 +
 +   ib_dma_unmap_page(priv-ca, tx_req-mapping[i + offs],
 + skb_frag_size(frag), DMA_TO_DEVICE);
 +   }
 +   } else
 +   ib_dma_unmap_single(priv-ca, tx_req-mapping[0], skb-len,
 +   DMA_TO_DEVICE);
 +}
 +
  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
  {
 struct ipoib_dev_priv *priv = netdev_priv(dev);
 @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 return ib_post_send(tx-qp, priv-tx_wr, bad_wr);
  }

 +static inline int post_send_sg(struct ipoib_dev_priv *priv,
 +  struct ipoib_cm_tx *tx,
 +  unsigned int wr_id,
 +  struct sk_buff *skb,
 +  u64 mapping[MAX_SKB_FRAGS + 1])
 +{
 +   struct ib_send_wr *bad_wr;
 +   int i, off;
 +   skb_frag_t *frags = skb_shinfo(skb)-frags;
 +   int nr_frags = skb_shinfo(skb)-nr_frags;
 +
 +   if (skb_headlen(skb)) {
 +   priv-tx_sge[0].addr = mapping[0];
 +   priv-tx_sge[0].length = skb_headlen(skb);
 +  

Re: [PATCH v3] IB/ipoib: Scatter-Gather support in connected mode

2015-06-10 Thread Erez Shitrit
On Mon, May 11, 2015 at 1:04 PM, Yuval Shaia yuval.sh...@oracle.com wrote:
 By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better performance.
 This MTU plus overhead puts the memory allocation for IP based packets at 32
 4k pages (order 5), which have to be contiguous.
 When the system memory under pressure, it was observed that allocating 128k
 contiguous physical memory is difficult and causes serious errors (such as
 system becomes unusable).

 This enhancement resolve the issue by removing the physically contiguous 
 memory
 requirement using Scatter/Gather feature that exists in Linux stack.

 With this fix Scatter-Gather will be supported also in connected mode.

 This change reverts the some of the change made in commit
 e112373fd6aa280bd2cbc0d5cc3809115325a1be
 (IPoIB/cm: Reduce connected mode TX object size).

 The ability to use SG in IPoIB CM is possible because the coupling
 between NETIF_F_SG and NETIF_F_CSUM was removed in commit
 ec5f061564238892005257c83565a0b58ec79295
 (net: Kill link between CSUM and SG features.)

 Signed-off-by: Yuval Shaia yuval.sh...@oracle.com
 ---
 v2: Enhance commit log message

 I'd like to thank John Sobecki john.sobe...@oracle.com and
 Christian Marie christ...@ponies.io for helping me with reviewing this 
 patch.
 I will be glad if you can ACK it.

 v3: Add an empty line to start a new paragraph and use 12 hex-digits 
 identifier
 ---
  drivers/infiniband/ulp/ipoib/ipoib.h  |4 +-
  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 
 +++--
  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |3 +-
  drivers/infiniband/ulp/ipoib/ipoib_main.c |2 +-
  4 files changed, 92 insertions(+), 24 deletions(-)

 diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
 b/drivers/infiniband/ulp/ipoib/ipoib.h
 index bd94b0a..372c8a2 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib.h
 +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
 @@ -239,7 +239,7 @@ struct ipoib_cm_tx {
 struct net_device   *dev;
 struct ipoib_neigh  *neigh;
 struct ipoib_path   *path;
 -   struct ipoib_cm_tx_buf *tx_ring;
 +   struct ipoib_tx_buf *tx_ring;
 unsigned tx_head;
 unsigned tx_tail;
 unsigned longflags;
 @@ -504,6 +504,8 @@ int ipoib_mcast_stop_thread(struct net_device *dev);
  void ipoib_mcast_dev_down(struct net_device *dev);
  void ipoib_mcast_dev_flush(struct net_device *dev);

 +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
 +
  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
  int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
 b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
 index 56959ad..eae106e 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
 +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
 @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv 
 *priv, int frags,
 ib_dma_unmap_page(priv-ca, mapping[i + 1], PAGE_SIZE, 
 DMA_FROM_DEVICE);
  }

 +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
 + struct ipoib_tx_buf *tx_req)
 +{
 +   struct sk_buff *skb;
 +   int i, offs;
 +
 +   skb = tx_req-skb;
 +   if (skb_shinfo(skb)-nr_frags) {
 +   offs = 0;
 +   if (skb_headlen(skb)) {
 +   ib_dma_unmap_single(priv-ca, tx_req-mapping[0],
 +   skb_headlen(skb), DMA_TO_DEVICE);
 +   offs = 1;
 +   }
 +   for (i = 0; i  skb_shinfo(skb)-nr_frags; ++i) {
 +   const skb_frag_t *frag = skb_shinfo(skb)-frags[i];
 +
 +   ib_dma_unmap_page(priv-ca, tx_req-mapping[i + offs],
 + skb_frag_size(frag), DMA_TO_DEVICE);
 +   }
 +   } else
 +   ib_dma_unmap_single(priv-ca, tx_req-mapping[0], skb-len,
 +   DMA_TO_DEVICE);
 +}
 +
  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
  {
 struct ipoib_dev_priv *priv = netdev_priv(dev);
 @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
 return ib_post_send(tx-qp, priv-tx_wr, bad_wr);
  }

 +static inline int post_send_sg(struct ipoib_dev_priv *priv,
 +  struct ipoib_cm_tx *tx,
 +  unsigned int wr_id,
 +  struct sk_buff *skb,
 +  u64 mapping[MAX_SKB_FRAGS + 1])
 +{
 +   struct ib_send_wr *bad_wr;
 +   int i, off;
 +   skb_frag_t *frags = skb_shinfo(skb)-frags;
 +   int nr_frags = skb_shinfo(skb)-nr_frags;
 +
 +   if (skb_headlen(skb)) {
 +   priv-tx_sge[0].addr = mapping[0];
 +   priv-tx_sge[0].length = skb_headlen(skb);
 +  

Re: [PATCH] IB/ipoib: Change To max_cm_mtu when changing mode to connected

2015-06-09 Thread Erez Shitrit
On Tue, Jun 9, 2015 at 12:04 AM, Doug Ledford dledf...@redhat.com wrote:
 On Mon, 2015-06-08 at 10:42 -0600, Jason Gunthorpe wrote:
 On Sun, Jun 07, 2015 at 01:36:11PM +0300, Erez Shitrit wrote:
 
  When switching between modes (datagram / connected) change the MTU
  accordingly.
  datagram mode up to 4K, connected mode up to (64K - 0x10).

 Is this a bug fix (describe the user visible impact)? Should it go to
 -stable? Add a Fixes: line?

 I'm not sure I would call it a bug.  Setting the MTU to max
 automatically is a policy decision more than anything else.  Currently,
 you have to enable connected mode, then set the MTU.  Both initscripts
 and NetworkManager do this for you on Red Hat, so the user sees no
 difference here.  I can't speak to other OSes.  I'm OK with setting
 connected mode MTU to max by default once we get the scatter/gather
 support for IPoIB added.

Hi Doug,
There is no connection to the scatter/gather patch in that fix.


 --
 Doug Ledford dledf...@redhat.com
   GPG KeyID: 0E572FDD

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/ipoib: Change To max_cm_mtu when changing mode to connected

2015-06-09 Thread Erez Shitrit
On Tue, Jun 9, 2015 at 12:04 AM, Doug Ledford dledf...@redhat.com wrote:
 On Mon, 2015-06-08 at 10:42 -0600, Jason Gunthorpe wrote:
 On Sun, Jun 07, 2015 at 01:36:11PM +0300, Erez Shitrit wrote:
 
  When switching between modes (datagram / connected) change the MTU
  accordingly.
  datagram mode up to 4K, connected mode up to (64K - 0x10).

 Is this a bug fix (describe the user visible impact)? Should it go to
 -stable? Add a Fixes: line?

 I'm not sure I would call it a bug.  Setting the MTU to max
 automatically is a policy decision more than anything else.  Currently,
 you have to enable connected mode, then set the MTU.  Both initscripts
 and NetworkManager do this for you on Red Hat, so the user sees no
 difference here.  I can't speak to other OSes.  I'm OK with setting

The only real reason for a user to switch to CM mode is the performance
gain, which comes only from the huge MTU that CM provides, so it should
somehow be part of the mode setting.
As you said, RH does it (in the new versions of its OS only) via
scripting outside of the driver code; other vendors don't.

 connected mode MTU to max by default once we get the scatter/gather
 support for IPoIB added.

OK, great. Thanks.

 --
 Doug Ledford dledf...@redhat.com
   GPG KeyID: 0E572FDD

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/ipoib: Change To max_cm_mtu when changing mode to connected

2015-06-07 Thread Erez Shitrit

When switching between modes (datagram / connected) change the MTU
accordingly.
datagram mode up to 4K, connected mode up to (64K - 0x10).

Signed-off-by: ELi Cohen e...@mellanox.com
Signed-off-by: Erez Shitrit ere...@mellanox.com
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 9e1b203..5b74184 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -232,6 +232,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
ipoib_warn(priv, enabling connected mode 
   will cause multicast packet drops\n);
netdev_update_features(dev);
+   dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
rtnl_unlock();
priv-tx_wr.send_flags = ~IB_SEND_IP_CSUM;
 
-- 
1.7.11.3

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state

2015-04-16 Thread Erez Shitrit

On 4/15/2015 8:20 PM, Devesh Sharma wrote:

-Original Message-
From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
ow...@vger.kernel.org] On Behalf Of Or Gerlitz
Sent: Thursday, April 02, 2015 4:09 PM
To: Roland Dreier; Doug Ledford
Cc: linux-rdma@vger.kernel.org; Erez Shitrit; Tal Alon; Amir Vadai; Or Gerlitz
Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state

From: Erez Shitrit ere...@mellanox.com

As the result of a completion error the QP can be moved to SQE state by the
hardware. Since it's not the Error state, there are no flushes and hence the
driver doesn't know about that.

As per the spec, a QP transition to SQE causes flush completions for subsequent WQEs, 
but the description says otherwise. Am I missing something?


No you are not :) . The description tries to say the following: the 
driver cannot distinguish between an IB_WC_WR_FLUSH_ERR that is treated as 
IB_WC_SUCCESS and an IB_WC_WR_FLUSH_ERR that comes after other errors that 
take the QP to SQE.
The driver must recognize the first error that is not IB_WC_WR_FLUSH_ERR 
and handle it accordingly.
For example, the driver can take the QP to ERROR state as a part of its 
life cycle (drain it, driver down, etc.), and in these situations many 
IB_WC_WR_FLUSH_ERR completions return and there is no need to change the state 
of the QP — it is already under the handling of the driver — which is not the 
case when another error comes. This is the intention of the patch: to return 
the QP back to life after those (un-handled) cases.



The fix creates a task that after completion with error which is not a flush
tracks the QP state and if it is in SQE state moves it back to RTS.

Signed-off-by: Erez Shitrit ere...@mellanox.com
Signed-off-by: Or Gerlitz ogerl...@mellanox.com
---
  drivers/infiniband/ulp/ipoib/ipoib.h|5 +++
  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59
++-
  2 files changed, 63 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
b/drivers/infiniband/ulp/ipoib/ipoib.h
index 769044c..2703d9a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -299,6 +299,11 @@ struct ipoib_neigh_table {
struct completion   deleted;
  };

+struct ipoib_qp_state_validate {
+   struct work_struct work;
+   struct ipoib_dev_priv   *priv;
+};
+
  /*
   * Device private locking: network stack tx_lock protects members used
   * in TX fast path, lock protects everything else.  lock nests inside diff 
--git
a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 29b376d..63b92cb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
}
  }

+/*
+ * As the result of a completion error the QP Can be transferred to SQE states.
+ * The function checks if the (send)QP is in SQE state and
+ * moves it back to RTS state, that in order to have it functional again.
+ */
+static void ipoib_qp_state_validate_work(struct work_struct *work) {
+   struct ipoib_qp_state_validate *qp_work =
+   container_of(work, struct ipoib_qp_state_validate, work);
+
+   struct ipoib_dev_priv *priv = qp_work-priv;
+   struct ib_qp_attr qp_attr;
+   struct ib_qp_init_attr query_init_attr;
+   int ret;
+
+   ret = ib_query_qp(priv-qp, qp_attr, IB_QP_STATE, query_init_attr);
+   if (ret) {
+   ipoib_warn(priv, %s: Failed to query QP ret: %d\n,
+  __func__, ret);
+   goto free_res;
+   }
+   pr_info(%s: QP: 0x%x is in state: %d\n,
+   __func__, priv-qp-qp_num, qp_attr.qp_state);
+
+   /* currently support only in SQE-RTS transition*/
+   if (qp_attr.qp_state == IB_QPS_SQE) {
+   qp_attr.qp_state = IB_QPS_RTS;
+
+   ret = ib_modify_qp(priv-qp, qp_attr, IB_QP_STATE);
+   if (ret) {
+   pr_warn(failed(%d) modify QP:0x%x SQE-RTS\n,
+   ret, priv-qp-qp_num);
+   goto free_res;
+   }
+   pr_info(%s: QP: 0x%x moved from IB_QPS_SQE to
IB_QPS_RTS\n,
+   __func__, priv-qp-qp_num);
+   } else {
+   pr_warn(QP (%d) will stay in state: %d\n,
+   priv-qp-qp_num, qp_attr.qp_state);
+   }
+
+free_res:
+   kfree(qp_work);
+}
+
  static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)  {
struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22
@@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc
*wc)
netif_wake_queue(dev);

if (wc-status != IB_WC_SUCCESS 
-   wc-status != IB_WC_WR_FLUSH_ERR)
+   wc-status != IB_WC_WR_FLUSH_ERR) {
+   struct ipoib_qp_state_validate *qp_work;
ipoib_warn(priv, failed send event

Re: [PATCH linux-next 1/4] infiniband/ipoib: fix possible NULL pointer dereference in ipoib_get_iflink

2015-04-16 Thread Erez Shitrit
On Wed, Apr 15, 2015 at 7:06 PM, Jason Gunthorpe
jguntho...@obsidianresearch.com wrote:
 On Wed, Apr 15, 2015 at 09:17:14AM +0300, Erez Shitrit wrote:
 +   /* parent interface */
 +   if (!test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
 +   return dev-ifindex;
 +
 +   /* child/vlan interface */
 +   if (!priv-parent)
 +   return -1;

 Like was said for other drivers, I can't see how parent can be null
 while IPOIB_FLAG_SUBINTERFACE is set. Drop the last if.

 It can, at least for ipoib child interface (AKA vlan), you can't
 control the call for that ndo and it can be called before the parent
 was set.

 If the ndo can be called before the netdev private structures are fully
 prepared then we have another bug, and returning -1 or 0 is not the right
 answer anyhow.

 For safety, fold this into your patch.

OK, will do that.


 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c 
 b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
 index 9fad7b5ac8b9..e62b007adf5d 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
 +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
 @@ -58,6 +58,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct 
 ipoib_dev_priv *priv,
 /* MTU will be reset when mcast join happens */
 priv-dev-mtu   = IPOIB_UD_MTU(priv-max_ib_mtu);
 priv-mcast_mtu  = priv-admin_mtu = priv-dev-mtu;
 +   priv-parent = ppriv-dev;
 set_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags);

 result = ipoib_set_dev_features(priv, ppriv-ca);
 @@ -84,8 +85,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct 
 ipoib_dev_priv *priv,
 goto register_failed;
 }

 -   priv-parent = ppriv-dev;
 -
 ipoib_create_debug_files(priv-dev);

 /* RTNL childs don't need proprietary sysfs entries */
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] IB/ipoib: Fix ndo_get_iflink

2015-04-16 Thread Erez Shitrit
Currently, iflink of the parent interface was always accessed, event 
when interface didn't have a parent and hence we rashed there.

Handle the interface types properly: for a child interface, return
the ifindex of the parent, for parent interface, return its ifindex.

For child devices, make sure to set the parent pointer prior to
invoking register_netdevice(), this allows the new ndo to be called
by the stack immediately after the child device is registered.

Fixes: 5aa7add8f14b ('infiniband/ipoib: implement ndo_get_iflink')
Reported-by: Honggang Li ho...@redhat.com
Signed-off-by: Erez Shitrit ere...@mellanox.com
Signed-off-by: Honggang Li ho...@redhat.com
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 5 +
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 657b89b..915ad04 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -846,6 +846,11 @@ static int ipoib_get_iflink(const struct net_device *dev)
 {
struct ipoib_dev_priv *priv = netdev_priv(dev);
 
+   /* parent interface */
+   if (!test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
+   return dev-ifindex;
+
+   /* child/vlan interface */
return priv-parent-ifindex;
 }
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c 
b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 4dd1313..fca1a88 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -58,6 +58,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct 
ipoib_dev_priv *priv,
/* MTU will be reset when mcast join happens */
priv-dev-mtu   = IPOIB_UD_MTU(priv-max_ib_mtu);
priv-mcast_mtu  = priv-admin_mtu = priv-dev-mtu;
+   priv-parent = ppriv-dev;
set_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags);
 
result = ipoib_set_dev_features(priv, ppriv-ca);
@@ -84,8 +85,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct 
ipoib_dev_priv *priv,
goto register_failed;
}
 
-   priv-parent = ppriv-dev;
-
ipoib_create_debug_files(priv-dev);
 
/* RTNL childs don't need proprietary sysfs entries */
-- 
1.7.11.3

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V1 net-next] IB/ipoib: Fix ndo_get_iflink

2015-04-16 Thread Erez Shitrit
Currently, iflink of the parent interface was always accessed, even 
when interface didn't have a parent and hence we crashed there.

Handle the interface types properly: for a child interface, return
the ifindex of the parent, for parent interface, return its ifindex.

For child devices, make sure to set the parent pointer prior to
invoking register_netdevice(), this allows the new ndo to be called
by the stack immediately after the child device is registered.

Fixes: 5aa7add8f14b ('infiniband/ipoib: implement ndo_get_iflink')
Reported-by: Honggang Li ho...@redhat.com
Signed-off-by: Erez Shitrit ere...@mellanox.com
Signed-off-by: Honggang Li ho...@redhat.com
---

changes from V0:
 - fixed two typos in the change-log

 drivers/infiniband/ulp/ipoib/ipoib_main.c | 5 +
 drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 3 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 657b89b..915ad04 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -846,6 +846,11 @@ static int ipoib_get_iflink(const struct net_device *dev)
 {
struct ipoib_dev_priv *priv = netdev_priv(dev);
 
+   /* parent interface */
+   if (!test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
+   return dev-ifindex;
+
+   /* child/vlan interface */
return priv-parent-ifindex;
 }
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c 
b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
index 4dd1313..fca1a88 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -58,6 +58,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct 
ipoib_dev_priv *priv,
/* MTU will be reset when mcast join happens */
priv-dev-mtu   = IPOIB_UD_MTU(priv-max_ib_mtu);
priv-mcast_mtu  = priv-admin_mtu = priv-dev-mtu;
+   priv-parent = ppriv-dev;
set_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags);
 
result = ipoib_set_dev_features(priv, ppriv-ca);
@@ -84,8 +85,6 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct 
ipoib_dev_priv *priv,
goto register_failed;
}
 
-   priv-parent = ppriv-dev;
-
ipoib_create_debug_files(priv-dev);
 
/* RTNL childs don't need proprietary sysfs entries */
-- 
1.7.11.3

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state

2015-04-16 Thread Erez Shitrit
On Thu, Apr 16, 2015 at 4:29 PM, Doug Ledford dledf...@redhat.com wrote:

 On Apr 15, 2015, at 9:29 AM, Erez Shitrit ere...@dev.mellanox.co.il wrote:

 Hi Doug

 (now with reply to all :) )

 On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford dledf...@redhat.com wrote:

 On Apr 2, 2015, at 6:39 AM, Or Gerlitz ogerl...@mellanox.com wrote:

 From: Erez Shitrit ere...@mellanox.com

 As the result of a completion error the QP can be moved to SQE state by
 the hardware. Since it's not the Error state, there are no flushes
 and hence the driver doesn't know about that.

 The fix creates a task that after completion with error which is not a
 flush tracks the QP state and if it is in SQE state moves it back to RTS.

 I assume you found this particular issue while tracking down the cause of 
 the problem that you fixed in patch #6?  Otherwise, this is a “never should 
 happen” situation, yes?


 Not only. It happened in the case that is described in patch#6 and also
 with some other errors that caused the FW/HW to decide to move the UD QP to
 SQE state (in the category of "bad things happened", like a damaged
 WQE, FW problem, etc.)
 The patch was originally written to solve such cases, which we really
 saw in testing, at customers, etc., and not for patch#6

 Signed-off-by: Erez Shitrit ere...@mellanox.com
 Signed-off-by: Or Gerlitz ogerl...@mellanox.com
 ---
 drivers/infiniband/ulp/ipoib/ipoib.h|5 +++
 drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 
 ++-
 2 files changed, 63 insertions(+), 1 deletions(-)

 diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
 b/drivers/infiniband/ulp/ipoib/ipoib.h
 index 769044c..2703d9a 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib.h
 +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
 @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
  struct completion   deleted;
 };

 +struct ipoib_qp_state_validate {
 + struct work_struct work;
 + struct ipoib_dev_priv   *priv;
 +};

 Why did you choose to do an allocated work struct?  All of the other tasks 
 we have use a static work struct that’s part of the ipoib private dev, so 
 this is a bit out of the norm.  In its current form, you only use this on 
 the single UD queue pair that the dev uses, and not ever on any connected 
 mode queue pairs, so it could be static.  Or were you looking at the 
 possibility of making the task work on the CM queue pairs and so left it 
 allocated?  And if that, why not include a reference to the broken queue 
 pair in the work struct instead of always using priv-qp?


 It looks (at least for me) more elegant to create on-the-fly work
 create/queue and delete at the end (BTW, I saw it many times, other
 than ipoib).
 There is no need for that in the CM mode, there the connection and the
 QP are destroyed, and new one will be created, only in UD mode the QP
 is forever.

 I knew it wasn’t needed in CM mode currently, but I didn’t know if there was 
 a plan in your mind to change the way CM worked so that it tried to rescue a 
 queue pair instead of destroying and starting over.

 However, I really think we need to be consistent in this driver.  If we 
 allocate our work struct in one and only one place, then we run into the 
 problem where someone working on the driver in the future has to try and 
 remember “Gee, is this the one function that allocates our work struct and we 
 must then free it to avoid leaking memory, or was that a different function?” 
  I would overlook that if we had a plan that involved this function ever 
 operating on anything other than the UD QP, in which case you would have to 
 allocate a struct to designate the QP in question, and you would have to 
 allocate a struct per instance because it would technically be possible to 
 have more than one QP in an SQE state at once.  But if that’s not the plan, 
 then please rework this patch to add the work struct to the ipoib private dev 
 like we do with all the other work structs.  I’d like to swap this one out of 
 my tree for your new one ASAP.


There is a plan to have more than one UD QP per device, in order to
have multi-queue interfaces (not according to the CM QP but with the
same idea that you said, many objects that can use different work
each).
So, i think that this way to allocate work object is still reasonable.


 /*
 * Device private locking: network stack tx_lock protects members used
 * in TX fast path, lock protects everything else.  lock nests inside
 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
 b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
 index 29b376d..63b92cb 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
 +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
 @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
  }
 }

 +/*
 + * As the result of a completion error the QP Can be transferred to SQE 
 states.
 + * The function checks if the (send)QP is in SQE state and
 + * moves it back to RTS state, that in order to have it functional again

Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state

2015-04-15 Thread Erez Shitrit
Hi Doug

(now with reply to all :) )

On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford dledf...@redhat.com wrote:

 On Apr 2, 2015, at 6:39 AM, Or Gerlitz ogerl...@mellanox.com wrote:

 From: Erez Shitrit ere...@mellanox.com

 As the result of a completion error the QP can be moved to SQE state by
 the hardware. Since it's not the Error state, there are no flushes
 and hence the driver doesn't know about that.

 The fix creates a task that after completion with error which is not a
 flush tracks the QP state and if it is in SQE state moves it back to RTS.

 I assume you found this particular issue while tracking down the cause of the 
 problem that you fixed in patch #6?  Otherwise, this is a “never should 
 happen” situation, yes?


Not only. It happened in the case that is described in patch#6 and also
with some other errors that caused the FW/HW to decide to move the UD QP to
SQE state (in the category of "bad things happened", like a damaged
WQE, FW problem, etc.)
The patch was originally written to solve such cases, which we really
saw in testing, at customers, etc., and not for patch#6

 Signed-off-by: Erez Shitrit ere...@mellanox.com
 Signed-off-by: Or Gerlitz ogerl...@mellanox.com
 ---
 drivers/infiniband/ulp/ipoib/ipoib.h|5 +++
 drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 
 ++-
 2 files changed, 63 insertions(+), 1 deletions(-)

 diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
 b/drivers/infiniband/ulp/ipoib/ipoib.h
 index 769044c..2703d9a 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib.h
 +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
 @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
   struct completion   deleted;
 };

 +struct ipoib_qp_state_validate {
 + struct work_struct work;
 + struct ipoib_dev_priv   *priv;
 +};

 Why did you choose to do an allocated work struct?  All of the other tasks we 
 have use a static work struct that’s part of the ipoib private dev, so this 
 is a bit out of the norm.  In its current form, you only use this on the 
 single UD queue pair that the dev uses, and not ever on any connected mode 
 queue pairs, so it could be static.  Or were you looking at the possibility 
 of making the task work on the CM queue pairs and so left it allocated?  And 
 if that, why not include a reference to the broken queue pair in the work 
 struct instead of always using priv-qp?


It looks (at least for me) more elegant to create on-the-fly work
create/queue and delete at the end (BTW, I saw it many times, other
than ipoib).
There is no need for that in the CM mode, there the connection and the
QP are destroyed, and new one will be created, only in UD mode the QP
is forever.

 /*
  * Device private locking: network stack tx_lock protects members used
  * in TX fast path, lock protects everything else.  lock nests inside
 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
 b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
 index 29b376d..63b92cb 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
 +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
 @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
   }
 }

 +/*
 + * As the result of a completion error the QP Can be transferred to SQE 
 states.
 + * The function checks if the (send)QP is in SQE state and
 + * moves it back to RTS state, that in order to have it functional again.
 + */
 +static void ipoib_qp_state_validate_work(struct work_struct *work)
 +{
 + struct ipoib_qp_state_validate *qp_work =
 + container_of(work, struct ipoib_qp_state_validate, work);
 +
 + struct ipoib_dev_priv *priv = qp_work-priv;
 + struct ib_qp_attr qp_attr;
 + struct ib_qp_init_attr query_init_attr;
 + int ret;
 +
 + ret = ib_query_qp(priv-qp, qp_attr, IB_QP_STATE, query_init_attr);
 + if (ret) {
 + ipoib_warn(priv, %s: Failed to query QP ret: %d\n,
 +__func__, ret);
 + goto free_res;
 + }
 + pr_info(%s: QP: 0x%x is in state: %d\n,
 + __func__, priv-qp-qp_num, qp_attr.qp_state);
 +
 + /* currently support only in SQE-RTS transition*/
 + if (qp_attr.qp_state == IB_QPS_SQE) {
 + qp_attr.qp_state = IB_QPS_RTS;
 +
 + ret = ib_modify_qp(priv-qp, qp_attr, IB_QP_STATE);
 + if (ret) {
 + pr_warn(failed(%d) modify QP:0x%x SQE-RTS\n,
 + ret, priv-qp-qp_num);
 + goto free_res;
 + }
 + pr_info(%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n,
 + __func__, priv-qp-qp_num);
 + } else {
 + pr_warn(QP (%d) will stay in state: %d\n,
 + priv-qp-qp_num, qp_attr.qp_state);
 + }
 +
 +free_res:
 + kfree(qp_work);
 +}
 +
 static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
   struct ipoib_dev_priv *priv = netdev_priv(dev);
 @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc

Re: [PATCH linux-next 1/4] infiniband/ipoib: fix possible NULL pointer dereference in ipoib_get_iflink

2015-04-15 Thread Erez Shitrit

On 4/14/2015 11:41 PM, Jason Gunthorpe wrote:

On Tue, Apr 14, 2015 at 07:30:03PM +0300, Erez Shitrit wrote:


diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 657b89b..11ea6e2 100644
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -846,7 +846,10 @@ static int ipoib_get_iflink(const struct net_device *dev)
  {
 struct ipoib_dev_priv *priv = netdev_priv(dev);

-   return priv-parent-ifindex;
+   if (priv  priv-parent)
+   return priv-parent-ifindex;
+   else
+   return 0;

This will make parent interface to return 0 instead of its own ifindex.
I would suggest write something like that:

Agree


+   /* parent interface */
+   if (!test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
+   return dev-ifindex;
+
+   /* child/vlan interface */
+   if (!priv-parent)
+   return -1;

Like was said for other drivers, I can't see how parent can be null
while IPOIB_FLAG_SUBINTERFACE is set. Drop the last if.
It can, at least for ipoib child interface (AKA vlan), you can't 
control the call for that ndo and it can be called before the parent was 
set.

Erez, you basically rewrote this, please make a proper patch with the
Fixes and Reported-By credit for Honggang. Lets merge this through
Dave M's tree right away.

Thank you all

Jason
.



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH linux-next 1/4] infiniband/ipoib: fix possible NULL pointer dereference in ipoib_get_iflink

2015-04-14 Thread Erez Shitrit
On Tue, Apr 14, 2015 at 6:20 PM, Honggang Li ho...@redhat.com wrote:


[...]

Hi,

 diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
 b/drivers/infiniband/ulp/ipoib/ipoib_main.c
 index 657b89b..11ea6e2 100644
 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
 +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
 @@ -846,7 +846,10 @@ static int ipoib_get_iflink(const struct net_device *dev)
  {
 struct ipoib_dev_priv *priv = netdev_priv(dev);

 -   return priv-parent-ifindex;
 +   if (priv  priv-parent)
 +   return priv-parent-ifindex;
 +   else
 +   return 0;
This will make parent interface to return 0 instead of its own ifindex.
I would suggest write something like that:

+   /* parent interface */
+   if (!test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
+   return dev-ifindex;
+
+   /* child/vlan interface */
+   if (!priv-parent)
+   return -1;
+
return priv-parent-ifindex;

Thanks,
Erez.

  }

  static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
 --
 1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/9] IB/ipoib: factor out ah flushing

2015-03-16 Thread Erez Shitrit

On 3/15/2015 8:42 PM, Doug Ledford wrote:

On Mar 13, 2015, at 1:39 AM, Or Gerlitz gerlitz...@gmail.com wrote:

On Tue, Mar 3, 2015 at 11:59 AM, Erez Shitrit ere...@dev.mellanox.co.il wrote:

On 3/2/2015 5:09 PM, Doug Ledford wrote:

On Sun, 2015-03-01 at 08:47 +0200, Erez Shitrit wrote:

On 2/26/2015 6:27 PM, Doug Ledford wrote:

@@ -1037,6 +1038,7 @@ static void __ipoib_ib_dev_flush(struct
ipoib_dev_priv *priv,
if (level == IPOIB_FLUSH_LIGHT) {
ipoib_mark_paths_invalid(dev);
ipoib_mcast_dev_flush(dev);
+   ipoib_flush_ah(dev, 0);

Why do you need to call the flush function here?

To remove all of the ah's that were reduced to a 0 refcount by the
previous two functions prior to restarting operations.  When we remove
an ah, it calls ib_destroy_ah which calls all the way down into the low
level driver.  This was to make sure that old, stale data was removed
all the way down to the card level before we started new queries for
paths and ahs.

Yes. but it is not needed.

That depends on the card.  For the modern cards (mlx4, mlx5, qib), it
isn't needed but doesn't hurt either.  For older cards (in particular,
mthca), the driver actually frees up card resources at the time of the
call.


Can you please elaborate more here, I took a look in the mthca and didn't
see that.
anyway, what i don't understand is why you need to do that now, the ah is
already in the dead_ah_list so, in at the most 1 sec will be cleared and if
the driver goes down your other hunk fixed that.

Doug, ten days and no response from you... lets finalize the review on
this series so we have it safely done for 4.1 -- on top of it Erez
prepares a set of IPoIB fixes from our internal tree and we want that
for 4.1 too. Please address.

I didn’t have much to say here.  I said that mthca can have card resources 
freed by this call, which is backed up by this code in mthca_ah.c

int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
{
 switch (ah-type) {
 case MTHCA_AH_ON_HCA:
 mthca_free(dev-av_table.alloc,
(ah-avdma - dev-av_table.ddr_av_base) /
MTHCA_AV_SIZE);
 break;


I’m not entirely sure how Erez missed that, but it’s there and it’s what gets 
called when we destroy an ah (depending on the card of course).  So, that 
represents one case where freeing the resources in a non-lazy fashion has a 
direct benefit.  And there is no cited drawback to freeing the resources in a 
non-lazy fashion on a net event, so I don’t see what there is to discuss 
further on the issue.

sorry, but i still don't see the connection to the device type.
It will be deleted/freed with the regular flow, like it does in the rest 
of the life cycle cases of the ah (in neigh_dtor, path_free, etc.), so 
why here it should be directly after the event?



—
Doug Ledford dledf...@redhat.com
GPG Key ID: 0E572FDD







--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/9] IB/ipoib: factor out ah flushing

2015-03-16 Thread Erez Shitrit

On 3/16/2015 6:06 PM, Doug Ledford wrote:

On Mar 16, 2015, at 8:24 AM, Erez Shitrit ere...@dev.mellanox.co.il wrote:

On 3/15/2015 8:42 PM, Doug Ledford wrote:

Doug, ten days and no response from you... lets finalize the review on
this series so we have it safely done for 4.1 -- on top of it Erez
prepares a set of IPoIB fixes from our internal tree and we want that
for 4.1 too. Please address.

I didn’t have much to say here.  I said that mthca can have card resources 
freed by this call, which is backed up by this code in mthca_ah.c

int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
{
 switch (ah-type) {
 case MTHCA_AH_ON_HCA:
 mthca_free(dev-av_table.alloc,
(ah-avdma - dev-av_table.ddr_av_base) /
MTHCA_AV_SIZE);
 break;


I’m not entirely sure how Erez missed that, but it’s there and it’s what gets 
called when we destroy an ah (depending on the card of course).  So, that 
represents one case where freeing the resources in a non-lazy fashion has a 
direct benefit.  And there is no cited drawback to freeing the resources in a 
non-lazy fashion on a net event, so I don’t see what there is to discuss 
further on the issue.

sorry, but i still don't see the connection to the device type.
It will be deleted/freed with the regular flow, like it does in the rest of the 
life cycle cases of the ah (in neigh_dtor, path_free, etc.), so why here it 
should be directly after the event?

Because it’s the right thing to do.  The only reason to do lazy deletion is 
when there is a performance benefit to batching.  There is no performance 
benefit to batching here.  And because on certain hardware the action frees 
resources on the card, which are limited, doing non-lazy deletion can be 
beneficial.  Given that there is no downside to doing the deletions in a 
non-lazy fashion, and that there can be an upside depending on hardware, there 
is no reason to stick with the lazy deletions.
OK, I understand your point; I am not sure why it is not always done with the ah 
deletion. Anyway, it is harmless here.



—
Doug Ledford dledf...@redhat.com
GPG Key ID: 0E572FDD







--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/9] IB/ipoib: deserialize multicast joins

2015-03-03 Thread Erez Shitrit

On 3/2/2015 5:29 PM, Doug Ledford wrote:

On Sun, 2015-03-01 at 15:58 +0200, Erez Shitrit wrote:

On 2/22/2015 2:27 AM, Doug Ledford wrote:

Allow the ipoib layer to attempt to join all outstanding multicast
groups at once.  The ib_sa layer will serialize multiple attempts to
join the same group, but will process attempts to join different groups
in parallel.  Take advantage of that.

In order to make this happen, change the mcast_join_thread to loop
through all needed joins, sending a join request for each one that we
still need to join.  There are a few special cases we handle though:

1) Don't attempt to join anything but the broadcast group until the join
of the broadcast group has succeeded.
2) No longer restart the join task at the end of completion handling.
If we completed successfully, we are done.  The join task now needs kicked
either by mcast_send or mcast_restart_task or mcast_start_thread, but
should not need started anytime else except when scheduling a backoff
attempt to rejoin.
3) No longer use separate join/completion routines for regular and
sendonly joins, pass them all through the same routine and just do the
right thing based on the SENDONLY join flag.
4) Only try to join a SENDONLY join twice, then drop the packets and
quit trying.  We leave the mcast group in the list so that if we get a
new packet, all that we have to do is queue up the packet and restart
the join task and it will automatically try to join twice and then
either send or flush the queue again.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
   drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 250 
-
   1 file changed, 82 insertions(+), 168 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 277e7ac7c4d..c670d9c2cda 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -307,111 +307,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast 
*mcast,
return 0;
   }
   
-static int
-ipoib_mcast_sendonly_join_complete(int status,
-				   struct ib_sa_multicast *multicast)
-{
-	struct ipoib_mcast *mcast = multicast->context;
-	struct net_device *dev = mcast->dev;
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-	/*
-	 * We have to take the mutex to force mcast_sendonly_join to
-	 * return from ib_sa_multicast_join and set mcast->mc to a
-	 * valid value.  Otherwise we were racing with ourselves in
-	 * that we might fail here, but get a valid return from
-	 * ib_sa_multicast_join after we had cleared mcast->mc here,
-	 * resulting in mis-matched joins and leaves and a deadlock
-	 */
-	mutex_lock(&mcast_mutex);
-
-	/* We trap for port events ourselves. */
-	if (status == -ENETRESET) {
-		status = 0;
-		goto out;
-	}
-
-	if (!status)
-		status = ipoib_mcast_join_finish(mcast, &multicast->rec);
-
-	if (status) {
-		if (mcast->logcount++ < 20)
-			ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
-					"join failed for %pI6, status %d\n",
-					mcast->mcmember.mgid.raw, status);
-
-		/* Flush out any queued packets */
-		netif_tx_lock_bh(dev);
-		while (!skb_queue_empty(&mcast->pkt_queue)) {
-			++dev->stats.tx_dropped;
-			dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
-		}
-		netif_tx_unlock_bh(dev);
-		__ipoib_mcast_schedule_join_thread(priv, mcast, 1);
-	} else {
-		mcast->backoff = 1;
-		mcast->delay_until = jiffies;
-		__ipoib_mcast_schedule_join_thread(priv, NULL, 0);
-	}
-out:
-	clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-	if (status)
-		mcast->mc = NULL;
-	complete(&mcast->done);
-	mutex_unlock(&mcast_mutex);
-	return status;
-}
-
-static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
-{
-	struct net_device *dev = mcast->dev;
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_sa_mcmember_rec rec = {
-#if 0				/* Some SMs don't support send-only yet */
-		.join_state = 4
-#else
-		.join_state = 1
-#endif
-	};
-	int ret = 0;
-
-	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
-		ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
-				"multicast joins\n");
-		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-		complete(&mcast->done);
-		return -ENODEV;
-	}
-
-	rec.mgid     = mcast->mcmember.mgid;
-	rec.port_gid = priv->local_gid;
-	rec.pkey     = cpu_to_be16(priv->pkey);

Re: [PATCH 7/9] IB/ipoib: fix MCAST_FLAG_BUSY usage

2015-03-03 Thread Erez Shitrit

On 3/2/2015 5:27 PM, Doug Ledford wrote:

On Sun, 2015-03-01 at 11:31 +0200, Erez Shitrit wrote:

On 2/22/2015 2:27 AM, Doug Ledford wrote:

Commit a9c8ba5884 ("IPoIB: Fix usage of uninitialized multicast
objects") added a new flag MCAST_JOIN_STARTED, but was not very strict
in how it was used.  We didn't always initialize the completion struct
before we set the flag, and we didn't always call complete on the
completion struct from all paths that complete it.  And when we did
complete it, sometimes we continued to touch the mcast entry after
the completion, opening us up to possible use after free issues.

This made it less than totally effective, and certainly made its use
confusing.  And in the flush function we would use the presence of this
flag to signal that we should wait on the completion struct, but we never
cleared this flag, ever.

In order to make things clearer and aid in resolving the rtnl deadlock
bug I've been chasing, I cleaned this up a bit.

   1) Remove the MCAST_JOIN_STARTED flag entirely
   2) Change MCAST_FLAG_BUSY so it now only means a join is in-flight
   3) Test mcast-mc directly to see if we have completed
  ib_sa_join_multicast (using IS_ERR_OR_NULL)
   4) Make sure that before setting MCAST_FLAG_BUSY we always initialize
  the mcast-done completion struct
   5) Make sure that before calling complete(mcast-done), we always clear
  the MCAST_FLAG_BUSY bit
   6) Take the mcast_mutex before we call ib_sa_multicast_join and also
  take the mutex in our join callback.  This forces
  ib_sa_multicast_join to return and set mcast-mc before we process
  the callback.  This way, our callback can safely clear mcast-mc
  if there is an error on the join and we will do the right thing as
  a result in mcast_dev_flush.
   7) Because we need the mutex to synchronize mcast-mc, we can no
  longer call mcast_sendonly_join directly from mcast_send and
  instead must add sendonly join processing to the mcast_join_task
   8) Make MCAST_RUN mean that we have a working mcast subsystem, not that
  we have a running task.  We know when we need to reschedule our
  join task thread and don't need a flag to tell us.
   9) Add a helper for rescheduling the join task thread

A number of different races are resolved with these changes.  These
races existed with the old MCAST_FLAG_BUSY usage, the
MCAST_JOIN_STARTED flag was an attempt to address them, and while it
helped, a determined effort could still trip things up.

One race looks something like this:

Thread 1 Thread 2
ib_sa_join_multicast (as part of running restart mcast task)
alloc member
call callback
   ifconfig ib0 down
 wait_for_completion
  callback call completes
   wait_for_completion in
 mcast_dev_flush completes
   mcast-mc is PTR_ERR_OR_NULL
   so we skip ib_sa_leave_multicast
  return from callback
return from ib_sa_join_multicast
set mcast-mc = return from ib_sa_multicast

We now have a permanently unbalanced join/leave issue that trips up the
refcounting in core/multicast.c

Another like this:

Thread 1   Thread 2 Thread 3
ib_sa_multicast_join
  ifconfig ib0 down
priv-broadcast = NULL
 join_complete
wait_for_completion
   mcast-mc is not yet set, so don't clear
return from ib_sa_join_multicast and set mcast-mc
   complete
   return -EAGAIN (making mcast-mc invalid)
call ib_sa_multicast_leave
on invalid mcast-mc, hang
forever

By holding the mutex around ib_sa_multicast_join and taking the mutex
early in the callback, we force mcast-mc to be valid at the time we
run the callback.  This allows us to clear mcast-mc if there is an
error and the join is going to fail.  We do this before we complete
the mcast.  In this way, mcast_dev_flush always sees consistent state
in regards to mcast-mc membership at the time that the
wait_for_completion() returns.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
   drivers/infiniband/ulp/ipoib/ipoib.h   |  11 +-
   drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 355 
-
   2 files changed, 238 insertions(+), 128 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index 9ef432ae72e..c79dcd5ee8a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -98,9 +98,15 @@ enum

Re: [PATCH 1/9] IB/ipoib: factor out ah flushing

2015-03-03 Thread Erez Shitrit

On 3/2/2015 5:09 PM, Doug Ledford wrote:

On Sun, 2015-03-01 at 08:47 +0200, Erez Shitrit wrote:

On 2/26/2015 6:27 PM, Doug Ledford wrote:

@@ -1037,6 +1038,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
if (level == IPOIB_FLUSH_LIGHT) {
ipoib_mark_paths_invalid(dev);
ipoib_mcast_dev_flush(dev);
+   ipoib_flush_ah(dev, 0);

Why do you need to call the flush function here?

To remove all of the ah's that were reduced to a 0 refcount by the
previous two functions prior to restarting operations.  When we remove
an ah, it calls ib_destroy_ah which calls all the way down into the low
level driver.  This was to make sure that old, stale data was removed
all the way down to the card level before we started new queries for
paths and ahs.

Yes, but it is not needed.

That depends on the card.  For the modern cards (mlx4, mlx5, qib), it
isn't needed but doesn't hurt either.  For older cards (in particular,
mthca), the driver actually frees up card resources at the time of the
call.


Can you please elaborate more here, I took a look in the mthca and 
didn't see that.
anyway, what i don't understand is why you need to do that now, the ah 
is already in the dead_ah_list so, in at the most 1 sec will be cleared 
and if the driver goes down your other hunk fixed that.



The bug happened when the driver was removed (via modprobe -r etc.), and
there were ah's in the dead_ah list, that was fixed by you in the
function ipoib_ib_dev_cleanup, the call that you added here is not
relevant to the bug (and IMHO is not needed at all)

I never said that this hunk was part of the original bug I saw before.


So, the task of cleaning the dead_ah is already there, no need to
recall it, it will be called anyway 1 sec at the most from now.

You can try that, take of that call, no harm or memory leak will happened.

I have no doubt that it will get freed later.  As I said, I never
considered this particular hunk part of that original bug.  But, as I
point out above, there is no harm in it for any hardware, and depending
on hardware it can help to make sure there isn't a shortage of
resources.  Given that fact, I see no reason to remove it.


I can't see the reason to use the flush not from the stop_ah, meaning
without setting the IPOIB_STOP_REAPER, the flush can send twice the same
work.

No, it can't.  The ah flush routine does not search through ahs to find
ones to flush.  When you delete neighbors and mcasts, they release their
references to ahs.  When the refcount goes to 0, the put routine puts
the ah on the to-be-deleted ah list.  All the flush does is take that
list and delete the items.  If you run the flush twice, the first run
deletes all the items on the to-be-deleted list, the second run sees an
empty list and does nothing.

As for using flush versus stop: the flush function cancels any delayed
ah_flush work so that it isn't racing with the normally scheduled

when you call cancel_delayed_work on work that can schedule itself, it
does not help; the work can be in the middle of the run and re-schedule
itself...

If it is in the middle of a run and reschedules itself, then it will
schedule itself at precisely the same time we would have anyway, and we
will get flushed properly, so the net result of this particular race is
that we end up doing exactly what we wanted to do anyway.


ah_flush, then flushes the workqueue to make sure anything that might
result in an ah getting freed is done, then flushes, then schedules a
new delayed flush_ah work 1 second later.  So, it does exactly what a
flush should do: it removes what there is currently to remove, and in
the case of a periodically scheduled garbage collection, schedules a new
periodic flush at the maximum interval.

It is not appropriate to call stop_ah at this point because it will
cancel the delayed work, flush the ahs, then never reschedule the
garbage collection.  If we called stop here, we would have to call start
later.  But that's not really necessary as the flush cancels the
scheduled work and reschedules it for a second later.


}

	if (level = IPOIB_FLUSH_NORMAL)

@@ -1100,6 +1102,14 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
ipoib_mcast_stop_thread(dev, 1);
ipoib_mcast_dev_flush(dev);

+	/*

+* All of our ah references aren't free until after
+* ipoib_mcast_dev_flush(), ipoib_flush_paths, and
+* the neighbor garbage collection is stopped and reaped.
+* That should all be done now, so make a final ah flush.
+*/
+   ipoib_stop_ah(dev, 1);
+
ipoib_transport_dev_cleanup(dev);
}


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord

Re: [PATCH 7/9] IB/ipoib: fix MCAST_FLAG_BUSY usage

2015-03-01 Thread Erez Shitrit

On 2/22/2015 2:27 AM, Doug Ledford wrote:

Commit a9c8ba5884 ("IPoIB: Fix usage of uninitialized multicast
objects") added a new flag MCAST_JOIN_STARTED, but was not very strict
in how it was used.  We didn't always initialize the completion struct
before we set the flag, and we didn't always call complete on the
completion struct from all paths that complete it.  And when we did
complete it, sometimes we continued to touch the mcast entry after
the completion, opening us up to possible use after free issues.

This made it less than totally effective, and certainly made its use
confusing.  And in the flush function we would use the presence of this
flag to signal that we should wait on the completion struct, but we never
cleared this flag, ever.

In order to make things clearer and aid in resolving the rtnl deadlock
bug I've been chasing, I cleaned this up a bit.

  1) Remove the MCAST_JOIN_STARTED flag entirely
  2) Change MCAST_FLAG_BUSY so it now only means a join is in-flight
  3) Test mcast-mc directly to see if we have completed
 ib_sa_join_multicast (using IS_ERR_OR_NULL)
  4) Make sure that before setting MCAST_FLAG_BUSY we always initialize
 the mcast-done completion struct
  5) Make sure that before calling complete(mcast-done), we always clear
 the MCAST_FLAG_BUSY bit
  6) Take the mcast_mutex before we call ib_sa_multicast_join and also
 take the mutex in our join callback.  This forces
 ib_sa_multicast_join to return and set mcast-mc before we process
 the callback.  This way, our callback can safely clear mcast-mc
 if there is an error on the join and we will do the right thing as
 a result in mcast_dev_flush.
  7) Because we need the mutex to synchronize mcast-mc, we can no
 longer call mcast_sendonly_join directly from mcast_send and
 instead must add sendonly join processing to the mcast_join_task
  8) Make MCAST_RUN mean that we have a working mcast subsystem, not that
 we have a running task.  We know when we need to reschedule our
 join task thread and don't need a flag to tell us.
  9) Add a helper for rescheduling the join task thread

A number of different races are resolved with these changes.  These
races existed with the old MCAST_FLAG_BUSY usage, the
MCAST_JOIN_STARTED flag was an attempt to address them, and while it
helped, a determined effort could still trip things up.

One race looks something like this:

Thread 1 Thread 2
ib_sa_join_multicast (as part of running restart mcast task)
   alloc member
   call callback
  ifconfig ib0 down
 wait_for_completion
 callback call completes
  wait_for_completion in
 mcast_dev_flush completes
   mcast-mc is PTR_ERR_OR_NULL
   so we skip ib_sa_leave_multicast
 return from callback
   return from ib_sa_join_multicast
set mcast-mc = return from ib_sa_multicast

We now have a permanently unbalanced join/leave issue that trips up the
refcounting in core/multicast.c

Another like this:

Thread 1   Thread 2 Thread 3
ib_sa_multicast_join
 ifconfig ib0 down
priv-broadcast = NULL
join_complete
wait_for_completion
   mcast-mc is not yet set, so don't clear
return from ib_sa_join_multicast and set mcast-mc
   complete
   return -EAGAIN (making mcast-mc invalid)
call ib_sa_multicast_leave
on invalid mcast-mc, hang
forever

By holding the mutex around ib_sa_multicast_join and taking the mutex
early in the callback, we force mcast-mc to be valid at the time we
run the callback.  This allows us to clear mcast-mc if there is an
error and the join is going to fail.  We do this before we complete
the mcast.  In this way, mcast_dev_flush always sees consistent state
in regards to mcast-mc membership at the time that the
wait_for_completion() returns.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
  drivers/infiniband/ulp/ipoib/ipoib.h   |  11 +-
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 355 -
  2 files changed, 238 insertions(+), 128 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index 9ef432ae72e..c79dcd5ee8a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -98,9 +98,15 @@ enum {
  
  	IPOIB_MCAST_FLAG_FOUND	  = 0,	/* used in set_multicast_list */

IPOIB_MCAST_FLAG_SENDONLY = 1,
-   IPOIB_MCAST_FLAG_BUSY = 2,  /* 

Re: [PATCH 8/9] IB/ipoib: deserialize multicast joins

2015-03-01 Thread Erez Shitrit

On 2/22/2015 2:27 AM, Doug Ledford wrote:

Allow the ipoib layer to attempt to join all outstanding multicast
groups at once.  The ib_sa layer will serialize multiple attempts to
join the same group, but will process attempts to join different groups
in parallel.  Take advantage of that.

In order to make this happen, change the mcast_join_thread to loop
through all needed joins, sending a join request for each one that we
still need to join.  There are a few special cases we handle though:

1) Don't attempt to join anything but the broadcast group until the join
of the broadcast group has succeeded.
2) No longer restart the join task at the end of completion handling.
If we completed successfully, we are done.  The join task now needs kicked
either by mcast_send or mcast_restart_task or mcast_start_thread, but
should not need started anytime else except when scheduling a backoff
attempt to rejoin.
3) No longer use separate join/completion routines for regular and
sendonly joins, pass them all through the same routine and just do the
right thing based on the SENDONLY join flag.
4) Only try to join a SENDONLY join twice, then drop the packets and
quit trying.  We leave the mcast group in the list so that if we get a
new packet, all that we have to do is queue up the packet and restart
the join task and it will automatically try to join twice and then
either send or flush the queue again.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 250 -
  1 file changed, 82 insertions(+), 168 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 277e7ac7c4d..c670d9c2cda 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -307,111 +307,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast 
*mcast,
return 0;
  }
  
-static int

-ipoib_mcast_sendonly_join_complete(int status,
-  struct ib_sa_multicast *multicast)
-{
-   struct ipoib_mcast *mcast = multicast-context;
-   struct net_device *dev = mcast-dev;
-   struct ipoib_dev_priv *priv = netdev_priv(dev);
-
-   /*
-* We have to take the mutex to force mcast_sendonly_join to
-* return from ib_sa_multicast_join and set mcast-mc to a
-* valid value.  Otherwise we were racing with ourselves in
-* that we might fail here, but get a valid return from
-* ib_sa_multicast_join after we had cleared mcast-mc here,
-* resulting in mis-matched joins and leaves and a deadlock
-*/
-   mutex_lock(mcast_mutex);
-
-   /* We trap for port events ourselves. */
-   if (status == -ENETRESET) {
-   status = 0;
-   goto out;
-   }
-
-   if (!status)
-   status = ipoib_mcast_join_finish(mcast, multicast-rec);
-
-   if (status) {
-   if (mcast-logcount++  20)
-   ipoib_dbg_mcast(netdev_priv(dev), sendonly multicast 
-   join failed for %pI6, status %d\n,
-   mcast-mcmember.mgid.raw, status);
-
-   /* Flush out any queued packets */
-   netif_tx_lock_bh(dev);
-   while (!skb_queue_empty(mcast-pkt_queue)) {
-   ++dev-stats.tx_dropped;
-   dev_kfree_skb_any(skb_dequeue(mcast-pkt_queue));
-   }
-   netif_tx_unlock_bh(dev);
-   __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
-   } else {
-   mcast-backoff = 1;
-   mcast-delay_until = jiffies;
-   __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
-   }
-out:
-   clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
-   if (status)
-   mcast-mc = NULL;
-   complete(mcast-done);
-   mutex_unlock(mcast_mutex);
-   return status;
-}
-
-static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
-{
-   struct net_device *dev = mcast-dev;
-   struct ipoib_dev_priv *priv = netdev_priv(dev);
-   struct ib_sa_mcmember_rec rec = {
-#if 0  /* Some SMs don't support send-only yet */
-   .join_state = 4
-#else
-   .join_state = 1
-#endif
-   };
-   int ret = 0;
-
-   if (!test_bit(IPOIB_FLAG_OPER_UP, priv-flags)) {
-   ipoib_dbg_mcast(priv, device shutting down, no sendonly 
-   multicast joins\n);
-   clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
-   complete(mcast-done);
-   return -ENODEV;
-   }
-
-   rec.mgid = mcast-mcmember.mgid;
-   rec.port_gid = priv-local_gid;
-   rec.pkey = cpu_to_be16(priv-pkey);
-
-   mutex_lock(mcast_mutex);
-   mcast-mc = ib_sa_join_multicast(ipoib_sa_client, priv-ca,
- 

Re: [PATCH 1/9] IB/ipoib: factor out ah flushing

2015-02-28 Thread Erez Shitrit

On 2/26/2015 6:27 PM, Doug Ledford wrote:

On Thu, 2015-02-26 at 15:28 +0200, Erez Shitrit wrote:

On 2/22/2015 2:26 AM, Doug Ledford wrote:

Create a an ipoib_flush_ah and ipoib_stop_ah routines to use at
appropriate times to flush out all remaining ah entries before we shut
the device down.

Because neighbors and mcast entries can each have a reference on any
given ah, we must make sure to free all of those first before our ah
will actually have a 0 refcount and be able to be reaped.

This factoring is needed in preparation for having per-device work
queues.  The original per-device workqueue code resulted in the following
error message:

ibdev: ib_dealloc_pd failed

That error was tracked down to this issue.  With the changes to which
workqueues were flushed when, there were no flushes of the per device
workqueue after the last ah's were freed, resulting in an attempt to
dealloc the pd with outstanding resources still allocated.  This code
puts the explicit flushes in the needed places to avoid that problem.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
   drivers/infiniband/ulp/ipoib/ipoib_ib.c | 46 
-
   1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 72626c34817..cb02466a0eb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -659,6 +659,24 @@ void ipoib_reap_ah(struct work_struct *work)
   round_jiffies_relative(HZ));
   }
   
+static void ipoib_flush_ah(struct net_device *dev, int flush)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	cancel_delayed_work(&priv->ah_reap_task);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);
+	ipoib_reap_ah(&priv->ah_reap_task.work);
+}
+
+static void ipoib_stop_ah(struct net_device *dev, int flush)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	set_bit(IPOIB_STOP_REAPER, &priv->flags);
+	ipoib_flush_ah(dev, flush);
+}
+
   static void ipoib_ib_tx_timer_func(unsigned long ctx)
   {
drain_tx_cq((struct net_device *)ctx);
@@ -877,24 +895,7 @@ timeout:
	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
		ipoib_warn(priv, "Failed to modify QP to RESET state\n");

-	/* Wait for all AHs to be reaped */
-	set_bit(IPOIB_STOP_REAPER, &priv->flags);
-	cancel_delayed_work(&priv->ah_reap_task);
-	if (flush)
-		flush_workqueue(ipoib_workqueue);
-
-	begin = jiffies;
-
-	while (!list_empty(&priv->dead_ahs)) {
-		__ipoib_reap_ah(dev);
-
-		if (time_after(jiffies, begin + HZ)) {
-			ipoib_warn(priv, "timing out; will leak address handles\n");
-			break;
-		}
-
-		msleep(1);
-	}
+	ipoib_flush_ah(dev, flush);

	ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
   
@@ -1037,6 +1038,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,

if (level == IPOIB_FLUSH_LIGHT) {
ipoib_mark_paths_invalid(dev);
ipoib_mcast_dev_flush(dev);
+   ipoib_flush_ah(dev, 0);

Why do you need to call the flush function here?

To remove all of the ah's that were reduced to a 0 refcount by the
previous two functions prior to restarting operations.  When we remove
an ah, it calls ib_destroy_ah which calls all the way down into the low
level driver.  This was to make sure that old, stale data was removed
all the way down to the card level before we started new queries for
paths and ahs.


Yes. but it is not needed.
The bug happened when the driver was removed (via modprobe -r etc.), and 
there were ah's in the dead_ah list, that was fixed by you in the 
function ipoib_ib_dev_cleanup, the call that you added here is not 
relevant to the bug (and IMHO is not needed at all)
So, the task of cleaning the dead_ah is already there, no need to
recall it, it will be called anyway 1 sec at the most from now.


You can try that, take of that call, no harm or memory leak will happened.


I can't see the reason to use the flush not from the stop_ah, meaning
without setting the IPOIB_STOP_REAPER, the flush can send twice the same
work.

No, it can't.  The ah flush routine does not search through ahs to find
ones to flush.  When you delete neighbors and mcasts, they release their
references to ahs.  When the refcount goes to 0, the put routine puts
the ah on the to-be-deleted ah list.  All the flush does is take that
list and delete the items.  If you run the flush twice, the first run
deletes all the items on the to-be-deleted list, the second run sees an
empty list and does nothing.

As for using flush versus stop: the flush function cancels any delayed
ah_flush work so that it isn't racing with the normally scheduled


when you call cancel_delayed_work to work that can schedule itself

Re: [PATCH 1/9] IB/ipoib: factor out ah flushing

2015-02-26 Thread Erez Shitrit

On 2/22/2015 2:26 AM, Doug Ledford wrote:

Create a an ipoib_flush_ah and ipoib_stop_ah routines to use at
appropriate times to flush out all remaining ah entries before we shut
the device down.

Because neighbors and mcast entries can each have a reference on any
given ah, we must make sure to free all of those first before our ah
will actually have a 0 refcount and be able to be reaped.

This factoring is needed in preparation for having per-device work
queues.  The original per-device workqueue code resulted in the following
error message:

ibdev: ib_dealloc_pd failed

That error was tracked down to this issue.  With the changes to which
workqueues were flushed when, there were no flushes of the per device
workqueue after the last ah's were freed, resulting in an attempt to
dealloc the pd with outstanding resources still allocated.  This code
puts the explicit flushes in the needed places to avoid that problem.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 46 -
  1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 72626c34817..cb02466a0eb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -659,6 +659,24 @@ void ipoib_reap_ah(struct work_struct *work)
   round_jiffies_relative(HZ));
  }
  
+static void ipoib_flush_ah(struct net_device *dev, int flush)

+{
+   struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+   cancel_delayed_work(priv-ah_reap_task);
+   if (flush)
+   flush_workqueue(ipoib_workqueue);
+   ipoib_reap_ah(priv-ah_reap_task.work);
+}
+
+static void ipoib_stop_ah(struct net_device *dev, int flush)
+{
+   struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+   set_bit(IPOIB_STOP_REAPER, priv-flags);
+   ipoib_flush_ah(dev, flush);
+}
+
  static void ipoib_ib_tx_timer_func(unsigned long ctx)
  {
drain_tx_cq((struct net_device *)ctx);
@@ -877,24 +895,7 @@ timeout:
if (ib_modify_qp(priv-qp, qp_attr, IB_QP_STATE))
ipoib_warn(priv, Failed to modify QP to RESET state\n);
  
-	/* Wait for all AHs to be reaped */

-   set_bit(IPOIB_STOP_REAPER, priv-flags);
-   cancel_delayed_work(priv-ah_reap_task);
-   if (flush)
-   flush_workqueue(ipoib_workqueue);
-
-   begin = jiffies;
-
-   while (!list_empty(priv-dead_ahs)) {
-   __ipoib_reap_ah(dev);
-
-   if (time_after(jiffies, begin + HZ)) {
-   ipoib_warn(priv, timing out; will leak address 
handles\n);
-   break;
-   }
-
-   msleep(1);
-   }
+   ipoib_flush_ah(dev, flush);
  
  	ib_req_notify_cq(priv-recv_cq, IB_CQ_NEXT_COMP);
  
@@ -1037,6 +1038,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,

if (level == IPOIB_FLUSH_LIGHT) {
ipoib_mark_paths_invalid(dev);
ipoib_mcast_dev_flush(dev);
+   ipoib_flush_ah(dev, 0);


Why do you need to call the flush function here?
I can't see the reason to use the flush not from the stop_ah, meaning 
without setting the IPOIB_STOP_REAPER, the flush can send twice the same 
work.



}
  
  	if (level = IPOIB_FLUSH_NORMAL)

@@ -1100,6 +1102,14 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
ipoib_mcast_stop_thread(dev, 1);
ipoib_mcast_dev_flush(dev);
  
+	/*

+* All of our ah references aren't free until after
+* ipoib_mcast_dev_flush(), ipoib_flush_paths, and
+* the neighbor garbage collection is stopped and reaped.
+* That should all be done now, so make a final ah flush.
+*/
+   ipoib_stop_ah(dev, 1);
+
ipoib_transport_dev_cleanup(dev);
  }
  


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 09/22] IB/ipoib: fix IPOIB_MCAST_RUN flag usage

2015-02-13 Thread Erez Shitrit

On 2/12/2015 3:43 AM, Doug Ledford wrote:

The usage of IPOIB_MCAST_RUN as a flag is inconsistent.  In some places
it is used to mean our device is administratively allowed to send
multicast joins/leaves/packets and in other places it means our
multicast join task thread is currently running and will process your
request if you put it on the queue.  However, this latter meaning is in
fact flawed as there is a race condition between the join task testing
the mcast list and finding it empty of remaining work, dropping the
mcast mutex and also the priv-lock spinlock, and clearing the
IPOIB_MCAST_RUN flag.  Further, there are numerous locations that use
the flag in the former fashion, and when all tasks complete and the task
thread clears the RUN flag, all of those other locations will fail to
ever again queue any work.  This results in the interface coming up fine
initially, but having problems adding new multicast groups after the
first round of groups have all been added and the RUN flag is cleared by
the join task thread when it thinks it is done.  To resolve this issue,
convert all locations in the code to treat the RUN flag as an indicator
that the multicast portion of this interface is in fact administratively
up and joins/leaves/sends can be performed.  There is no harm (other
than a slight performance penalty) to never clearing this flag and using
it in this fashion as it simply means that a few places that used to
micro-optimize how often this task was queued on a work queue will now
queue the task a few extra times.  We can address that suboptimal
behavior in future patches.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 11 +--
  1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bc50dd0d0e4..91b8fe118ec 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -630,8 +630,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
	}

	ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
-
-	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
 }

 int ipoib_mcast_start_thread(struct net_device *dev)

@@ -641,8 +639,8 @@ int ipoib_mcast_start_thread(struct net_device *dev)
	ipoib_dbg_mcast(priv, "starting multicast thread\n");

	mutex_lock(&mcast_mutex);
-	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+	set_bit(IPOIB_MCAST_RUN, &priv->flags);


Hi Doug,
Can you use IPOIB_FLAG_OPER_UP instead of IPOIB_MCAST_RUN, in all these 
places and get rid of that RUN bit?



+	queue_delayed_work(priv->wq, &priv->mcast_task, 0);
	mutex_unlock(&mcast_mutex);

	return 0;

@@ -725,7 +723,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
	memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
	__ipoib_mcast_add(dev, mcast);
	list_add_tail(&mcast->list, &priv->multicast_list);
-	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
		queue_delayed_work(priv->wq, &priv->mcast_task, 0);
 }

@@ -951,7 +949,8 @@ void ipoib_mcast_restart_task(struct work_struct *work)
	/*
	 * Restart our join task if needed
	 */
-	ipoib_mcast_start_thread(dev);
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_delayed_work(priv->wq, &priv->mcast_task, 0);
	rtnl_unlock();
  }
  


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic

2015-01-27 Thread Erez Shitrit

On 1/27/2015 12:00 AM, Doug Ledford wrote:

On Mon, 2015-01-26 at 22:57 +0200, Or Gerlitz wrote:

On Mon, Jan 26, 2015 at 9:38 PM, Doug Ledford dledf...@redhat.com wrote:

On Mon, 2015-01-26 at 15:16 +0200, Or Gerlitz wrote:

On Mon, Jan 26, 2015 at 3:00 PM, Erez Shitrit ere...@mellanox.com wrote:

Following commit 016d9fb25cd9 IPoIB: fix MCAST_FLAG_BUSY usage both
IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
working.


Hi Doug + Roland

Erez was very patiently reviewing and testing all the six (V0...V5)
patch series you sent to fix the 3.19-rc1 regression.

Yes he has.



  Can you also give this patch a try?

I can test it.  But I need to know how it's supposed to be applied.

just apply it on latest upstream and run whatever tests you have, simple.

I used the same base kernel that I used for my patchset.


It might fix the regression, it might also reintroduce a race on
ifup/ifdown.  I'll test and see.

Let's see it in action @ your env

It passed the initial IPv6 after a failed join issue that my own
patchset just finally passes.

However, I didn't get more than 5 minutes into testing before I was able
to livelock the system.  In this case, from machine A running my
patchset, I did

ping6 -I mlx4_ib0 -i .25 machine B address

On machine B running Erez's patch, I did:

rmmod ib_ipoib; modprobe ib_ipoib mcast_debug_level=1; sleep 2; ping6
-i .25 -c 10 -I mlx4_ib0 machine A address

And on the machine rdma-master, where the opensm runs, I did just a few:

systemctl restart opensm

The livelock is in the mcast flushing code.  On the machine that
livelocked, here's the dmesg tail:

[  423.189514] mlx4_ib0.8002: multicast join failed for 
ff12:401b:8002:::::, status -110
[  423.189541] mlx4_ib0.8002: deleting multicast group 
ff12:401b:8002:::::0001
[  423.189545] mlx4_ib0.8002: deleting multicast group 
ff12:601b:8002:::::0001
[  423.189547] mlx4_ib0.8002: deleting multicast group 
ff12:601b:8002:::0001:ff7b:e1b1
[  423.189549] mlx4_ib0.8002: deleting multicast group 
ff12:401b:8002:::::00fb
[  423.189551] mlx4_ib0.8002: deleting multicast group 
ff12:401b:8002:::::
[  423.204570] mlx4_ib0.8002: stopping multicast thread
[  423.204573] mlx4_ib0.8002: flushing multicast list
[  423.213567] mlx4_ib0: stopping multicast thread
[  423.213571] mlx4_ib0: flushing multicast list

The rmmod operation is stuck in ib_sa_unregister_client (one of the
specific fixes my patchset resolves BTW).


The patch I sent, only claims to fix the regression in multicast and 
sendonly issues, it can replace the first 3 patches and the one you sent 
me off list from your last patchset.
(probably there are more bugs, that the rest of your patchset solved, 
hence it should be tested with the rest of your patchset)


And probably there are more bugs that your patchset didn't fix yet :-)
For example, I run your last patchset+ the last fix you sent me with:
modprobe -r ib_ipoib; modprobe  ib_ipoib;
ping6
some adding/deleting  one child interface and got the next panic:

[81209.348259] ib0: join completion for 
ff12:601b::::0001:ff43:3bf1 (status -102)
[81209.408787] BUG: unable to handle kernel NULL pointer dereference at 
0020
[81209.416750] IP: [a096b399] ipoib_mcast_join+0xa9/0x1b0 
[ib_ipoib]

[81209.423787] PGD 0
[81209.425864] Oops:  [#1] SMP
[81209.429165] Modules linked in: ib_ipoib(E) ib_cm mlx4_ib ib_sa ib_mad 
ib_core ib_addr mlx4_core netconsole configfs nfsv3 nfs_acl 
rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs fscache lockd grace 
ip6table_filter ip6_tables ebtable_nat ebtables nf_conntrack_ipv4 
nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT nf_reject_ipv4 
xt_CHECKSUM iptable_mangle iptable_filter ip_tables autofs4 sunrpc 
bridge stp llc ipv6 dm_mirror dm_region_hash dm_log dm_mod vhost_net 
macvtap macvlan vhost tun kvm_intel kvm iTCO_wdt iTCO_vendor_support 
dcdbas microcode pcspkr serio_raw wmi sg lpc_ich mfd_core i7core_edac 
edac_core bnx2 ext3(E) jbd(E) mbcache(E) sr_mod(E) cdrom(E) sd_mod(E) 
pata_acpi(E) ata_generic(E) ata_piix(E) megaraid_sas(E) [last unloaded: 
ib_cm]
[81209.495358] CPU: 10 PID: 7655 Comm: kworker/u64:0 Tainted: 
GE  3.18.0+ #1
[81209.503297] Hardware name: Dell Inc. PowerEdge R710/0MD99X, BIOS 
6.4.0 07/23/2013

[81209.510905] Workqueue: ipoib_wq ipoib_mcast_join_task [ib_ipoib]
[81209.516975] task: 88041bb64050 ti: 88041b968000 task.ti: 
88041b968000
[81209.524566] RIP: 0010:[a096b399] [a096b399] 
ipoib_mcast_join+0xa9/0x1b0 [ib_ipoib]

[81209.534079] RSP: 0018:88041b96bcf8  EFLAGS: 00010202
[81209.539447] RAX:  RBX: 8803bd92e8c0 RCX: 

[81209.546636] RDX: 88082fcaea38 RSI: 88082fcad238 RDI: 
88082fcad238
[81209.553833] RBP: 88041b96bd78 R08:  R09: 
60f0
[81209.561027] R10:  R11: 0001 R12

Re: [PATCH FIX For-3.19 v5 00/10] Fix ipoib regressions

2015-01-26 Thread Erez Shitrit

On 1/26/2015 2:51 PM, Doug Ledford wrote:

On Mon, 2015-01-26 at 12:27 +0200, Erez Shitrit wrote:


New (and full) dmesg attached, (after modprobe ib_ipoib, with all debug
flags set) it is all there.

Thank you, I know what's going on here now.  Will correct shortly.


welcome -:)




The main cause is the concept that was broken for the send-only join,
when you treat the sendonly like a regular mcg and add it to the mc list
and to the mc_task etc.

I'm looking at 3.18 right now, and 3.18 adds sendonly groups to the mcg
just like my current code does.  The only difference, and I do mean
*only*, is that it calls sendonly_join directly instead of via the
mcast_task.

Yes, and I already wrote that it is more than just "only"; it changed
the concept of the sendonly mc packet.

Be more specific please.  What do you mean by concept?  And just so we
are clear, this all started because the existing multicast code was
super easy to break and was racy, so if the concept you are referring
to is what made the original code easy to break and racy, I'm not going
to care one whit that I changed that concept.



I agree that you fixed many bugs in your patches to 3.18, where the mc 
flow was easy to break, no argue about that.
The only issue that i disagree is about the way now sendonly is handled 
(and i think that this is the reason for the regression we see now).
In general, IMHO, the sendonly join is part of the TX flow and not part 
of the  ipoib_set_mcast_list flow.
The original meaning of the ipoib_set_mcast_list task that restart the 
mc_task is to be used for the kernel in order to add one or more new 
mcg's macs to the driver/HW (ndo_set_rx_mode), the sendonly mc is not 
such object, its mac should not be part of the mac list of the driver 
(in IB words, no qp_attach for it) and from the kernel point of view 
whenever it sends packet from sendonly mcg type no need to do the join, 
it's a regular send, the only reason we have the sendonly join is the IB 
enforcement for such mcg.
The reason the driver keeps the sendonly mcg in its mc_list is for 
other reasons; the first is to handle the case when the kernel decides 
to move a mcg from sendonly membership to full-member, one more other 
reason is to do the leave operation when needed and not for being 
handled as a full-member mcg.



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH FIX For-3.19 v5 00/10] Fix ipoib regressions

2015-01-26 Thread Erez Shitrit

On 1/26/2015 3:37 PM, Doug Ledford wrote:

On Mon, 2015-01-26 at 15:24 +0200, Erez Shitrit wrote:

The main cause is the concept that was broken for the send-only join,
when you treat the sendonly like a regular mcg and add it to the mc list
and to the mc_task etc.

I'm looking at 3.18 right now, and 3.18 adds sendonly groups to the mcg
just like my current code does.  The only difference, and I do mean
*only*, is that it calls sendonly_join directly instead of via the
mcast_task.

Yes, and I already wrote that it is more than just "only"; it changed
the concept of the sendonly mc packet.

Be more specific please.  What do you mean by concept?  And just so we
are clear, this all started because the existing multicast code was
super easy to break and was racy, so if the concept you are referring
to is what made the original code easy to break and racy, I'm not going
to care one whit that I changed that concept.


I agree that you fixed many bugs in your patches to 3.18, where the mc
flow was easy to break, no argue about that.
The only issue that i disagree is about the way now sendonly is handled
(and i think that this is the reason for the regression we see now).

It's not.  The patch I sent you off list should be sufficient at this
point to finally put your issues to rest.


In general, IMHO, the sendonly join is part of the TX flow and not part
of the  ipoib_set_mcast_list flow.

I disagree.  I'll get into that below.


The original meaning of the ipoib_set_mcast_list task that restart the
mc_task is to be used for the kernel in order to add one or more new
mcg's macs to the driver/HW (ndo_set_rx_mode), the sendonly mc is not
such object, its mac should not be part of the mac list of the driver
(in IB words, no qp_attach for it)

This part I agree with.


  and from the kernel point of view
whenever it sends packet from sendonly mcg type no need to do the join,
it's a regular send, the only reason we have the sendonly join is the IB
enforcement for such mcg.

This is where you and I differ.  There is a requirement on us from the
IB spec that we have to join a sendonly MC group in order to do what the
kernel would think of as a normal send if we were on Ethernet.  We
aren't on Ethernet though, so acting like we are to the above layer is
fine, but in our own layer we should be coding to the realities of our
layer.  And that means treating a sendonly MC group like a regular MC
group.


Just to make it clear, the IB request to do sendonly join was done prior 
to your changes. (stopped after them.)
The IB spec doesn't care if you mix sendonly with full-member, the 
kernel does care, and by adding them to the mc_list you abuse the kernel 
ndo (by adding unnecessary joins to it, as i wrote in my previous mail)
I think you should not mix between RX operation (like the full-member 
join, kernel rx ndo) to TX operation (sendonly join)





The reason the driver keeps the sendonly mcg in its mc_list is for
other reasons; the first is to handle the case when the kernel decides
to move a mcg from sendonly membership to full-member, one more other
reason is to do the leave operation when needed and not for being
handled as a full-member mcg.

It's been a long time since I first started working on this issue, so
some of the details are fuzzy in my memory.  I think my first 8 patch
patchset is now about 6 months old.  But, I think probably the most
important thing you left off this list, and the one that trumps all the
others, is that whether a sendonly MC group has a QP attached or not, we
still have to account for these groups, we have to track them, and on
shutdown we have to reliably leave them without oopsing.  Putting the
join of the sendonly groups directly in mcast_send opens us up to
problems with synchronizing our mcast list (either due to a dev flush, a
mcast restart task, or something else).


This is the reason the sendonly mcg was added to the mc_list. and should 
stay there.



   I can't say for certain that
the change to how we record the return value from ib_sa_join_multicast
in sendonly_join is enough to keep the sendonly join code from racing
with the flush code and causing problems.  For that reason, I am highly
reluctant to put the sendonly join back directly into the tx path
because that could have been part of the original problem in the first
place, it's just that my memory of back then is too fuzzy to say that
with 100% certainty either way.


That fear from the unknown is not a reason to break the original logic 
of the sendonly.

if there is such a problem, we need to find it and to fix.


Thanks, Erez





--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3 FIX for-3.19] IB/ipoib: Fix sendonly traffic and multicast traffic

2015-01-26 Thread Erez Shitrit

Following commit 016d9fb25cd9 IPoIB: fix MCAST_FLAG_BUSY usage both
IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
working.

After this change there is no mechanism to handle the work that does the
join process for the rest of the mcg's. For example, if in the list of
all the mcg's there is a send-only request, after its processing, the
code in ipoib_mcast_sendonly_join_complete() will not requeue the
mcast task, but leaves the bit that signals this task is running,
and hence the task will never run.

Also, whenever the kernel sends multicast packet (w.o joining to this
group), we don't call ipoib_send_only_join(), the code tries to start
the mcast task but it failed because the bit IPOIB_MCAST_RUN is always
set, As a result the multicast packet will never be sent.

The fix handles all the join requests via the same logic, and call
explicitly to sendonly join whenever there is a packet from sendonly
type.

Since ipoib_mcast_sendonly_join() is now called from the driver TX flow, we
can't take mutex there. We avoid this locking by using temporary pointer when
calling ib_sa_join_multicast and testing that pointer object to be valid (not
error), if it is an error the driver knows that the completion will not be
called, and the driver can set the mcast-mc value.


Fixes: 016d9fb25cd9 ('IPoIB: fix MCAST_FLAG_BUSY usage')
Reported-by: Eyal Perry eya...@mellanox.com
Signed-off-by: Erez Shitrit ere...@mellanox.com
Signed-off-by: Or Gerlitz ogerl...@mellanox.com

---
Changes from V1:
1. always do clear_bit(IPOIB_MCAST_FLAG_BUSY) in
ipoib_mcast_sendonly_join_complete()
2. Sync between ipoib_mcast_sendonly_join() to
ipoib_mcast_sendonly_join_complete using
an IS_ERR_OR_NULL(mcast-mc) test

Changes from V2:
1. ipoib_mcast_sendonly_join will not  abuse mcast-mc pointer.

 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   38 
 1 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bc50dd0..bbb4d3d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -306,6 +306,8 @@ out:
clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
if (status)
mcast-mc = NULL;
+   else
+   mcast-mc = multicast;
complete(mcast-done);
if (status == -ENETRESET)
status = 0;
@@ -317,6 +319,7 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
 {
struct net_device *dev = mcast-dev;
struct ipoib_dev_priv *priv = netdev_priv(dev);
+   struct ib_sa_multicast   *mc_ret;
struct ib_sa_mcmember_rec rec = {
 #if 0  /* Some SMs don't support send-only yet */
.join_state = 4
@@ -342,20 +345,20 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
rec.port_gid = priv-local_gid;
rec.pkey = cpu_to_be16(priv-pkey);
 
-   mutex_lock(mcast_mutex);
init_completion(mcast-done);
set_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
-   mcast-mc = ib_sa_join_multicast(ipoib_sa_client, priv-ca,
-priv-port, rec,
-IB_SA_MCMEMBER_REC_MGID|
-IB_SA_MCMEMBER_REC_PORT_GID|
-IB_SA_MCMEMBER_REC_PKEY|
-IB_SA_MCMEMBER_REC_JOIN_STATE,
-GFP_ATOMIC,
-ipoib_mcast_sendonly_join_complete,
-mcast);
-   if (IS_ERR(mcast-mc)) {
-   ret = PTR_ERR(mcast-mc);
+   mc_ret = ib_sa_join_multicast(ipoib_sa_client, priv-ca,
+ priv-port, rec,
+ IB_SA_MCMEMBER_REC_MGID   |
+ IB_SA_MCMEMBER_REC_PORT_GID   |
+ IB_SA_MCMEMBER_REC_PKEY   |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ GFP_ATOMIC,
+ ipoib_mcast_sendonly_join_complete,
+ mcast);
+   if (IS_ERR(mc_ret)) {
+   mcast-mc = mc_ret;
+   ret = PTR_ERR(mc_ret);
clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
complete(mcast-done);
ipoib_warn(priv, ib_sa_join_multicast for sendonly join 
@@ -364,7 +367,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
ipoib_dbg_mcast(priv, no multicast record for %pI6, starting 
sendonly join\n, mcast-mcmember.mgid.raw);
}
-   mutex_unlock(mcast_mutex);
 
return ret;
 }
@@ -622,10 +624,8 @@ void

Re: [PATCH FIX For-3.19 v5 00/10] Fix ipoib regressions

2015-01-26 Thread Erez Shitrit

On 1/26/2015 12:21 AM, Doug Ledford wrote:

On Sun, 2015-01-25 at 14:54 +0200, Erez Shitrit wrote:

On 1/23/2015 6:52 PM, Doug Ledford wrote:

On Thu, 2015-01-22 at 09:31 -0500, Doug Ledford wrote:

My 8 patch set taken into 3.19 caused some regressions.  This patch
set resolves those issues.

These patches are to resolve issues created by my previous patch set.
While that set worked fine in my testing, there were problems with
multicast joins after the initial set of joins had completed.  Since my
testing relied upon the normal set of multicast joins that happen
when the interface is first brought up, I missed those problems.

Symptoms vary from failure to send packets due to a failed join, to
loss of connectivity after a subnet manager restart, to failure
to properly release multicast groups on shutdown resulting in hangs
when the mlx4 driver attempts to unload itself via its reboot
notifier handler.

This set of patches has passed a number of tests above and beyond my
original tests.  As suggested by Or Gerlitz I added IPv6 and IPv4
multicast tests.  I also added both subnet manager restarts and
manual shutdown/restart of individual ports at the switch in order to
ensure that the ENETRESET path was properly tested.  I included
testing, then a subnet manager restart, then a quiescent period for
caches to expire, then restarting testing to make sure that arp and
neighbor discovery work after the subnet manager restart.

All in all, I have not been able to trip the multicast joins up any
longer.

Additionally, the original impetus for my first 8 patch set was that
it was simply too easy to break the IPoIB subsystem with this simple
loop:

while true; do
  ifconfig ib0 up
  ifconfig ib0 down
done

Just to be safe, I made sure this problem did not resurface.

v5: fix an oversight in mcast_restart_task that leaked mcast joins
  fix a failure to flush the ipoib_workqueue on deregister that
  meant we could end up running our code after our device had been
  removed, resulting in an oops
  remove a debug message that could be triggered so fast that the
  kernel printk mechanism would starve out the mcast join task thread
  resulting in what looked like a mcast failure that was really just
  delayed action


Doug Ledford (10):
IB/ipoib: fix IPOIB_MCAST_RUN flag usage
IB/ipoib: Add a helper to restart the multicast task
IB/ipoib: make delayed tasks not hold up everything
IB/ipoib: Handle -ENETRESET properly in our callback
IB/ipoib: don't restart our thread on ENETRESET
IB/ipoib: remove unneeded locks
IB/ipoib: fix race between mcast_dev_flush and mcast_join
IB/ipoib: fix ipoib_mcast_restart_task
IB/ipoib: flush the ipoib_workqueue on unregister
IB/ipoib: cleanup a couple debug messages

   drivers/infiniband/ulp/ipoib/ipoib.h   |   1 +
   drivers/infiniband/ulp/ipoib/ipoib_main.c  |   2 +
   drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 234 
++---
   3 files changed, 131 insertions(+), 106 deletions(-)


FWIW, a couple different customers have tried a test kernel I built
internally with my patches and I've had multiple reports that all
previously observed issues have been resolved.


Hi Doug,
still see an issue with the last version, and as a result no sendonly or
IPv6 is working.

I disagree with your assessment.  See below.


The scenario is somehow simple to reproduce: if there is a sendonly
multicast group that failed to join (sm refuses, perhaps the group was
closed, etc.)

This is not the case.  failed to join implies that the join completed
with an error.  According to the logs below, something impossible is
happening.


  that causes all the other mcg's to be blocked behind it
forever.

Your right.  Because the join tasks are serialized and we only queue up
one join at a time (something I would like to address, but not in the
3.19 timeframe), if one join simply never returns, as the logs below
indicate, then it blocks up the whole system.


for example, there is bad mcg ff12:601b::::::0016
that the sm refuses to join, and after some time the user tries to send
packets to ip address 225.5.5.5 (mcg:
ff12:401b:::::0105:0505 )
the log will show something like:
[1561627.426080] ib0: no multicast record for
ff12:601b::::::0016, starting sendonly join

This is printed out by sendonly_join


[1561633.726768] ib0: setting up send only multicast group for
ff12:401b:::::0105:0505

This is part of mcast_send and indicates we have created the new group
and added it to the mcast rbtree, but not that we've started the join.


[1561643.498990] ib0: no multicast record for
ff12:601b::::::0016, starting sendonly join

Our mcast task got ran again, and we are restarting the same group for
some reason.  Theoretically that means we should have cleared the busy
flag on this group somehow, which requires

Re: [PATCH V2 FIX for-3.19] IB/ipoib: Fix broken multicast flow

2015-01-25 Thread Erez Shitrit

On 1/22/2015 10:40 PM, Doug Ledford wrote:

On Thu, 2015-01-22 at 15:31 +0200, Or Gerlitz wrote:

From: Erez Shitrit ere...@mellanox.com

Following commit 016d9fb25cd9 IPoIB: fix MCAST_FLAG_BUSY usage both
IPv6 traffic and for the most cases all IPv4 multicast traffic aren't
working.

After this change there is no mechanism to handle the work that does the
join process for the rest of the mcg's. For example, if in the list of
all the mcg's there is a send-only request, after its processing, the
code in ipoib_mcast_sendonly_join_complete() will not requeue the
mcast task, but leaves the bit that signals this task is running,
and hence the task will never run.

Also, whenever the kernel sends multicast packet (w.o joining to this
group), we don't call ipoib_send_only_join(), the code tries to start
the mcast task but it failed because the bit IPOIB_MCAST_RUN is always
set, As a result the multicast packet will never be sent.

The fix handles all the join requests via the same logic, and call
explicitly to sendonly join whenever there is a packet from sendonly type.

Since ipoib_mcast_sendonly_join() is now called from the driver TX flow,
we can't take mutex there. We avoid locking by testing the multicast
object to be valid (not error or null).

Fixes: 016d9fb25cd9 ('IPoIB: fix MCAST_FLAG_BUSY usage')
Reported-by: Eyal Perry eya...@mellanox.com
Signed-off-by: Erez Shitrit ere...@mellanox.com
Signed-off-by: Or Gerlitz ogerl...@mellanox.com
---

Changes from V1:

1. always do clear_bit(IPOIB_MCAST_FLAG_BUSY) 
inipoib_mcast_sendonly_join_complete()

This part is good.


2. Sync between ipoib_mcast_sendonly_join() to 
ipoib_mcast_sendonly_join_complete
using a IS_ERR_OR_NULL() test

This part is no good.  You just added a kernel data corrupter or kernel
oopser depending on the situation.


  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   16 ++--
  1 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bc50dd0..212cfb4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -342,7 +342,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
rec.port_gid = priv-local_gid;
rec.pkey = cpu_to_be16(priv-pkey);
  
-	mutex_lock(mcast_mutex);

init_completion(mcast-done);
set_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
mcast-mc = ib_sa_join_multicast(ipoib_sa_client, priv-ca,
@@ -354,8 +353,8 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
 GFP_ATOMIC,
 ipoib_mcast_sendonly_join_complete,
 mcast);
-   if (IS_ERR(mcast-mc)) {
-   ret = PTR_ERR(mcast-mc);
+   if (IS_ERR_OR_NULL(mcast-mc)) {
+   ret = mcast-mc ? PTR_ERR(mcast-mc) : -EAGAIN;
clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
complete(mcast-done);
ipoib_warn(priv, ib_sa_join_multicast for sendonly join 
@@ -364,7 +363,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
ipoib_dbg_mcast(priv, no multicast record for %pI6, starting 
sendonly join\n, mcast-mcmember.mgid.raw);
}
-   mutex_unlock(mcast_mutex);
  
  	return ret;

  }

These three hunks allow the join completion routine to run parallel to
ib_sa_join_multicast returning.  It is a race condition to see who
finishes first.  However, what's not much of a race condition is that if
the join completion finishes first, and it set mcast-mc = NULL;, then
it also ran clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast_flags); and
complete(mcast-done);.  We will now enter into an if statement and run
those same things again.  If, at the time the first complete was run, we
were already waiting in mcast_dev_flush or mcast_restart_task for the
completion to happen, it is entirely possible, and maybe even probable,
that one of those routines would have already proceeded to free our
mcast struct out from underneath us.  By the time we get around to
running the clear_bit and complete in this routine, it is entirely
possible that our memory has been freed and reused and we are causing
random data corruption.  Or it's been freed and during reuse it was
cleared and now when we try to dereference those pointers we oops the
kernel.

The alternative, if you really want to remove that lock, is that we
can't abuse mcast-mc to store the return value.  Instead, we would need
to use a temporary pointer, and check that temporary pointer for
ERR_PTR.  If we got that, then we know we will never run our completion
routine and we should complete ourselves.  If we get anything else, then
it might be valid, or it might get nullified by our completion routine,
so we simply throw the data away and let the completion

Re: [PATCH FIX For-3.19 v5 00/10] Fix ipoib regressions

2015-01-25 Thread Erez Shitrit

On 1/23/2015 6:52 PM, Doug Ledford wrote:

On Thu, 2015-01-22 at 09:31 -0500, Doug Ledford wrote:

My 8 patch set taken into 3.19 caused some regressions.  This patch
set resolves those issues.

These patches are to resolve issues created by my previous patch set.
While that set worked fine in my testing, there were problems with
multicast joins after the initial set of joins had completed.  Since my
testing relied upon the normal set of multicast joins that happen
when the interface is first brought up, I missed those problems.

Symptoms vary from failure to send packets due to a failed join, to
loss of connectivity after a subnet manager restart, to failure
to properly release multicast groups on shutdown resulting in hangs
when the mlx4 driver attempts to unload itself via its reboot
notifier handler.

This set of patches has passed a number of tests above and beyond my
original tests.  As suggested by Or Gerlitz I added IPv6 and IPv4
multicast tests.  I also added both subnet manager restarts and
manual shutdown/restart of individual ports at the switch in order to
ensure that the ENETRESET path was properly tested.  I included
testing, then a subnet manager restart, then a quiescent period for
caches to expire, then restarting testing to make sure that arp and
neighbor discovery work after the subnet manager restart.

All in all, I have not been able to trip the multicast joins up any
longer.

Additionally, the original impetus for my first 8 patch set was that
it was simply too easy to break the IPoIB subsystem with this simple
loop:

while true; do
 ifconfig ib0 up
 ifconfig ib0 down
done

Just to be safe, I made sure this problem did not resurface.

v5: fix an oversight in mcast_restart_task that leaked mcast joins
 fix a failure to flush the ipoib_workqueue on deregister that
 meant we could end up running our code after our device had been
 removed, resulting in an oops
 remove a debug message that could be triggered so fast that the
 kernel printk mechanism would starve out the mcast join task thread
 resulting in what looked like a mcast failure that was really just
 delayed action


Doug Ledford (10):
   IB/ipoib: fix IPOIB_MCAST_RUN flag usage
   IB/ipoib: Add a helper to restart the multicast task
   IB/ipoib: make delayed tasks not hold up everything
   IB/ipoib: Handle -ENETRESET properly in our callback
   IB/ipoib: don't restart our thread on ENETRESET
   IB/ipoib: remove unneeded locks
   IB/ipoib: fix race between mcast_dev_flush and mcast_join
   IB/ipoib: fix ipoib_mcast_restart_task
   IB/ipoib: flush the ipoib_workqueue on unregister
   IB/ipoib: cleanup a couple debug messages

  drivers/infiniband/ulp/ipoib/ipoib.h   |   1 +
  drivers/infiniband/ulp/ipoib/ipoib_main.c  |   2 +
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 234 ++---
  3 files changed, 131 insertions(+), 106 deletions(-)


FWIW, a couple different customers have tried a test kernel I built
internally with my patches and I've had multiple reports that all
previously observed issues have been resolved.



Hi Doug,
still see an issue with the last version, and as a result no sendonly or 
IPv6 is working.
The scenario is somehow simple to reproduce: if there is a sendonly 
multicast group that failed to join (sm refuses, perhaps the group was 
closed, etc.) that causes all the other mcg's to be blocked behind it 
forever.
for example, there is bad mcg ff12:601b::::::0016 
that the sm refuses to join, and after some time the user tries to send 
packets to ip address 225.5.5.5 (mcg: 
ff12:401b:::::0105:0505 )

the log will show something like:
[1561627.426080] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561633.726768] ib0: setting up send only multicast group for 
ff12:401b:::::0105:0505
[1561643.498990] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561675.645424] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561691.718464] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561707.791609] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561723.864839] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561739.937981] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[1561756.010895] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join



forever, or until the SM decides in the future to let 
ff12:601b::::::0016 join; until then, no sendonly traffic.



The main cause is the concept that was broken for the send-only join, 
when you treat the sendonly like a regular mcg and add it to the mc list 
and to 

Re: [PATCH V1 FIX for-3.19] IB/ipoib: Fix broken multicast flow

2015-01-22 Thread Erez Shitrit

On 1/13/2015 8:07 PM, Doug Ledford wrote:

On Wed, 2015-01-07 at 17:04 +0200, Or Gerlitz wrote:

From: Erez Shitrit ere...@mellanox.com

Following commit 016d9fb25cd9 IPoIB: fix MCAST_FLAG_BUSY usage
both IPv6 traffic and for the most cases all IPv4 multicast traffic
aren't working.

After this change there is no mechanism to handle the work that does the
join process for the rest of the mcg's. For example, if in the list of
all the mcg's there is a send-only request, after its processing, the
code in ipoib_mcast_sendonly_join_complete() will not requeue the
mcast task, but leaves the bit that signals this task is running,
and hence the task will never run.

Also, whenever the kernel sends multicast packet (w.o joining to this
group), we don't call ipoib_send_only_join(), the code tries to start
the mcast task but it failed because the bit IPOIB_MCAST_RUN is always
set, As a result the multicast packet will never be sent.

The fix handles all the join requests via the same logic, and call
explicitly to sendonly join whenever there is a packet from sendonly type.

Since ipoib_mcast_sendonly_join() is now called from the driver TX flow,
we can't take mutex there. Locking isn't required there since the multicast
join callback will be called only after the SA agent initialized the relevant
multicast object.

Fixes: 016d9fb25cd9 ('IPoIB: fix MCAST_FLAG_BUSY usage')
Reported-by: Eyal Perry eya...@mellanox.com
Signed-off-by: Erez Shitrit ere...@mellanox.com
Signed-off-by: Or Gerlitz ogerl...@mellanox.com
---
V0 -- V1 changes: Added credits (...) and furnished the change-log abit.

  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   15 ++-
  1 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bc50dd0..0ea4b08 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -301,9 +301,10 @@ ipoib_mcast_sendonly_join_complete(int status,
dev_kfree_skb_any(skb_dequeue(mcast-pkt_queue));
}
netif_tx_unlock_bh(dev);
+
+   clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
}
  out:
-   clear_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
if (status)
mcast-mc = NULL;
complete(mcast-done);

This chunk is wrong.  We are in our complete routine, which means
ib_sa_join_multicast is calling us for this mcast group, and we will
never see another return for this group.  We must clear the BUSY flag no
matter what as the BUSY flag now indicates that our mcast join is still
outstanding in the lower layer ib_sa_ area, not that we have joined the
group.  Please re-read my patches that re-worked the BUSY flag usage.
The BUSY flag was poorly named/used in the past, which is why a previous
patch introduced the JOINING or whatever flag it was called.  My
patchset reworks the flag usage to be more sane.  BUSY now means
*exactly* that: this mcast group is in the process of joining, aka it's
BUSY.  It doesn't mean we've joined the group and there are no more
outstanding join requests.  That's signified by mcast-mc !=
IS_ERR_OR_NULL.
Agreed, my mistake; the clear_bit(IPOIB_MCAST_FLAG_BUSY) should happen in
any case. I will move it 3 lines later.

BTW, this will solve the in-flight messages you saw.



@@ -342,7 +343,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
rec.port_gid = priv-local_gid;
rec.pkey = cpu_to_be16(priv-pkey);
  
-	mutex_lock(mcast_mutex);

init_completion(mcast-done);
set_bit(IPOIB_MCAST_FLAG_BUSY, mcast-flags);
mcast-mc = ib_sa_join_multicast(ipoib_sa_client, priv-ca,
@@ -364,7 +364,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast 
*mcast)
ipoib_dbg_mcast(priv, no multicast record for %pI6, starting 
sendonly join\n, mcast-mcmember.mgid.raw);
}
-   mutex_unlock(mcast_mutex);
  
  	return ret;

  }

No!  You can not, under any circumstances, remove this locking!  One of
the things that frustrated me for a bit until I tracked it down was how
ib_sa_join_multicast returns errors to the ipoib layer.  When you call
ib_sa_join_multicast, the return value is either a valid mcast-mc
pointer or IS_ERR(err).  If it's a valid pointer, that does not mean we
have successfully joined, it means that we might join, but it isn't
until we have completed the callback that we know.  The callback will
clear out mcast-mc if we encounter an error during the callback and
know that by returning an error from the callback, the lower layer is
going to delete the mcast-mc context out from underneath us.  As it
turns out, we often get our callbacks called even before we get the
initial return from ib_sa_join_multicast.  If we don't have this
locking, and we get any error in the callback, the callback will clear
mcast-mc to indicate that we

Re: [PATCH V3 FIX For-3.19 0/3] IB/ipoib: Fix multicast join flow

2015-01-15 Thread Erez Shitrit

On 1/15/2015 5:24 PM, Doug Ledford wrote:

On Thu, 2015-01-15 at 09:19 +, Erez Shitrit wrote:

Hi Doug,

Thank you for the quick response.

Now I can see 2 issues, that I want to draw your attention to:

1. if there is an mcg that the driver failed to join, the mc_task enters an
endless loop of re-queuing, and the log fills up with the following messages:
[682560.569826] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.580136] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.590364] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.600504] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.610627] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.620769] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.631082] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.640835] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
[682560.651033] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.660758] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
[682560.670923] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.680676] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
[682560.690898] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.700630] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22

around 100 times a sec.

OK, this looks like the send only joins that fail are not setting a
fallback properly or something like that.  There is a separate bug that
I've isolated that I'm going to fix, then I we can see if that fix
effects things here, as it very well might.


2. IPv6 still doesn't work for me, at the same case where it is not the first 
mcg in the list.

Can you give me some sort of instructions on how to replicate your
testing?  Things are working for me here, but I don't have a complex
IPv6 setup and mine may be too simple to reproduce what you are seeing.
I don't have a complex setup, i have 2 devices, and i do a regular ping6 
from device with the full series in it, to some other device. nothing 
special, the only thing i can say that in the list there is one sendonly 
mcg (


ff12:601b::::::0016) that is at the first place in the list.
Anyway, I think it is connected to the first issue: because it is in an endless
loop with the first mcg, it doesn't get the chance to handle the other mcg's.




Thanks, Erez

-Original Message-
From: Doug Ledford [mailto:dledf...@redhat.com]
Sent: Wednesday, January 14, 2015 9:53 PM
To: linux-rdma@vger.kernel.org; rol...@kernel.org
Cc: Amir Vadai; Eyal Perry; Erez Shitrit; Or Gerlitz; Doug Ledford
Subject: [PATCH V3 FIX For-3.19 0/3] IB/ipoib: Fix multicast join flow

This patch series fixes the multicast join behavior problems introduced by my 
previous patchset.  In particular, the original code did not use the send only 
join code from the multicast thread context, and so it did not need to restart 
the multicast thread.  After my previous patchset, it does get called from the 
thread context, and so the send only join completion areas need to restart the 
join thread but they don't.  This patchset makes them do so.  It then adds in 
some cleanups for restarting the thread, and fixes the fact that one delayed 
join holds up the entire list of joins.

v3: Resend because the last send didn't register in patchworks properly
 (because the subject-prefix was not on all of the emails, only the
 first) and because the Cc: list didn't pass from the cover letter
 to patches

v2: Added two new patches, the first creates a helper to restart the
 multicast join thread and also adds using it in the two places where
 it should have been used but wasn't, the second allows the joins to
 proceed around a delayed join instead of stalling everything.

v1: Addressed the usage of the IPOIB_MCAST_RUN flag

Doug Ledford (3):
   IB/ipoib: Fix failed multicast joins/sends
   IB/ipoib: Add a helper to restart the multicast task
   IB/ipoib: make delayed tasks not hold up everything

  drivers/infiniband/ulp/ipoib/ipoib.h   |  1 +
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 94 ++
  2 files changed, 66 insertions(+), 29 deletions(-)

--
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo

RE: [PATCH V3 FIX For-3.19 0/3] IB/ipoib: Fix multicast join flow

2015-01-15 Thread Erez Shitrit
Hi Doug,

Thank you for the quick response.

Now I can see 2 issues, that I want to draw your attention to:

1. if there is an mcg that the driver failed to join, the mc_task enters an
endless loop of re-queuing, and the log fills up with the following messages:
[682560.569826] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.580136] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.590364] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.600504] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.610627] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.620769] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.631082] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.640835] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
[682560.651033] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.660758] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
[682560.670923] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.680676] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
[682560.690898] ib0: no multicast record for 
ff12:601b::::::0016, starting sendonly join
[682560.700630] ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22

around 100 times a sec.

2. IPv6 still doesn't work for me, at the same case where it is not the first 
mcg in the list.

Thanks, Erez

-Original Message-
From: Doug Ledford [mailto:dledf...@redhat.com] 
Sent: Wednesday, January 14, 2015 9:53 PM
To: linux-rdma@vger.kernel.org; rol...@kernel.org
Cc: Amir Vadai; Eyal Perry; Erez Shitrit; Or Gerlitz; Doug Ledford
Subject: [PATCH V3 FIX For-3.19 0/3] IB/ipoib: Fix multicast join flow

This patch series fixes the multicast join behavior problems introduced by my 
previous patchset.  In particular, the original code did not use the send only 
join code from the multicast thread context, and so it did not need to restart 
the multicast thread.  After my previous patchset, it does get called from the 
thread context, and so the send only join completion areas need to restart the 
join thread but they don't.  This patchset makes them do so.  It then adds in 
some cleanups for restarting the thread, and fixes the fact that one delayed 
join holds up the entire list of joins.

v3: Resend because the last send didn't register in patchworks properly
(because the subject-prefix was not on all of the emails, only the
first) and because the Cc: list didn't pass from the cover letter
to patches

v2: Added two new patches, the first creates a helper to restart the
multicast join thread and also adds using it in the two places where
it should have been used but wasn't, the second allows the joins to
proceed around a delayed join instead of stalling everything.

v1: Addressed the usage of the IPOIB_MCAST_RUN flag

Doug Ledford (3):
  IB/ipoib: Fix failed multicast joins/sends
  IB/ipoib: Add a helper to restart the multicast task
  IB/ipoib: make delayed tasks not hold up everything

 drivers/infiniband/ulp/ipoib/ipoib.h   |  1 +
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 94 ++
 2 files changed, 66 insertions(+), 29 deletions(-)

--
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH FIX for-3.19] IB/ipoib: Fix failed multicast joins/sends

2015-01-14 Thread Erez Shitrit

On 1/14/2015 6:09 PM, Doug Ledford wrote:

On Wed, 2015-01-14 at 18:02 +0200, Erez Shitrit wrote:

Hi Doug,

Perhaps I am missing something here, but ping6 still doesn't work for me
in many cases.

I think the reason is that your origin patch does the following:
in function ipoib_mcast_join_task
  if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, mcast-flags))
  ipoib_mcast_sendonly_join(mcast);
  else
  ipoib_mcast_join(dev, mcast, 1);
  return;
The flow for sendonly_join doesn't include handling the mc_task, so only
the first mc in the list (if it is sendonly mcg) will be sent, and no
more mcg's that are in the ipoib mc list are going to be sent. (see how
it is in ipoib_mcast_join flow)

Yes, I know what you are talking about.  However, my patches did not add
this bug, it was present in the original code.  Please check a plain
v3.18 kernel, which does not have my patches, and you will see that
ipoib_mcast_sendonly_join_complete also fails to restart the mcast join
thread there as well.

Agreed.
But in 3.18 there was no call from the mc_task to sendonly_join, just to the
full-member join, so there was no need at that point to handle the task. (The
call for sendonly-join was made on demand whenever a new packet for an mcg
was sent by the kernel.)

only in 3.19 the sendonly join was called explicitly from the mc_task.



I can demonstrate it with the log of ipoib:
I am trying to ping6 fe80::202:c903:9f:3b0a via ib0

The log is:
ib0: restarting multicast task
ib0: setting up send only multicast group for
ff12:601b::::::0016
ib0: adding multicast entry for mgid ff12:601b::::0001:ff43:3bf1
ib0: no multicast record for ff12:601b::::::0016,
starting sendonly join
ib0: join completion for ff12:601b::::::0001 (status 0)
ib0: MGID ff12:601b::::::0001 AV 88081afb5f40,
LID 0xc015, SL 0
ib0: join completion for ff12:401b::::::0001 (status 0)
ib0: MGID ff12:401b::::::0001 AV 88081e1c42c0,
LID 0xc014, SL 0
ib0: sendonly multicast join failed for
ff12:601b::::::0016, status -22
ib0: no multicast record for ff12:601b::::::0016,
starting sendonly join
ib0: sendonly multicast join failed for
ff12:601b::::::0016, status -22
ib0: no multicast record for ff12:601b::::::0016,
starting sendonly join
ib0: sendonly multicast join failed for
ff12:601b::::::0016, status -22
ib0: setting up send only multicast group for
ff12:601b::::::0002
ib0: no multicast record for ff12:601b::::::0016,
starting sendonly join
ib0: sendonly multicast join failed for
ff12:601b::::::0016, status -22
ib0: setting up send only multicast group for
ff12:601b::::0001:ff9f:3b0a
   here you can see that the ipv6 address is added and queued
to the list
ib0: no multicast record for ff12:601b::::::0016,
starting sendonly join
ib0: sendonly multicast join failed for
ff12:601b::::::0016, status -22
   the ipv6 mcg will not be sent because it is after some other
sendonly, and no one in that flow re-queue the mc_task again.

This is a problem with the design of the original mcast task thread.
I'm looking at a fix now.  Currently the design only allows one join to
be outstanding at a time.  Is there a reason for that that I'm not aware
of?  Some historical context that I don't know about?
IMHO, the reason for allowing only one mc in flight at a time was to make our
life easier; otherwise there are locks to take/manage, races between
multiple responses, etc. Also, the multicast module in the core keeps all
the requests serialized.
Perhaps you can use the relevant code from the full-member join in the
sendonly join in order to handle the mc_task, or move the send-only call
back to mcast_send instead of the mc_task.






--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH FIX for-3.19] IB/ipoib: Fix failed multicast joins/sends

2015-01-14 Thread Erez Shitrit

Hi Doug,

Perhaps I am missing something here, but ping6 still doesn't work for me 
in many cases.


I think the reason is that your origin patch does the following:
in function ipoib_mcast_join_task
if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, mcast-flags))
ipoib_mcast_sendonly_join(mcast);
else
ipoib_mcast_join(dev, mcast, 1);
return;
The flow for sendonly_join doesn't include handling the mc_task, so only 
the first mc in the list (if it is sendonly mcg) will be sent, and no 
more mcg's that are in the ipoib mc list are going to be sent. (see how 
it is in ipoib_mcast_join flow)


I can demonstrate it with the log of ipoib:
I am trying to ping6 fe80::202:c903:9f:3b0a via ib0

The log is:
ib0: restarting multicast task
ib0: setting up send only multicast group for 
ff12:601b::::::0016

ib0: adding multicast entry for mgid ff12:601b::::0001:ff43:3bf1
ib0: no multicast record for ff12:601b::::::0016, 
starting sendonly join

ib0: join completion for ff12:601b::::::0001 (status 0)
ib0: MGID ff12:601b::::::0001 AV 88081afb5f40, 
LID 0xc015, SL 0

ib0: join completion for ff12:401b::::::0001 (status 0)
ib0: MGID ff12:401b::::::0001 AV 88081e1c42c0, 
LID 0xc014, SL 0
ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
ib0: no multicast record for ff12:601b::::::0016, 
starting sendonly join
ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
ib0: no multicast record for ff12:601b::::::0016, 
starting sendonly join
ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
ib0: setting up send only multicast group for 
ff12:601b::::::0002
ib0: no multicast record for ff12:601b::::::0016, 
starting sendonly join
ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
ib0: setting up send only multicast group for 
ff12:601b::::0001:ff9f:3b0a
 here you can see that the ipv6 address is added and queued 
to the list
ib0: no multicast record for ff12:601b::::::0016, 
starting sendonly join
ib0: sendonly multicast join failed for 
ff12:601b::::::0016, status -22
 the ipv6 mcg will not be sent because it is after some other 
sendonly, and no one in that flow re-queue the mc_task again.


Thanks, Erez

The usage of IPOIB_MCAST_RUN as a flag is inconsistent.  In some places
it is used to mean our device is administratively allowed to send
multicast joins/leaves/packets and in other places it means our
multicast join task thread is currently running and will process your
request if you put it on the queue.  However, this latter meaning is in
fact flawed as there is a race condition between the join task testing
the mcast list and finding it empty of remaining work, dropping the
mcast mutex and also the priv-lock spinlock, and clearing the
IPOIB_MCAST_RUN flag.  Further, there are numerous locations that use
the flag in the former fashion, and when all tasks complete and the task
thread clears the RUN flag, all of those other locations will fail to
ever again queue any work.  This results in the interface coming up fine
initially, but having problems adding new multicast groups after the
first round of groups have all been added and the RUN flag is cleared by
the join task thread when it thinks it is done.  To resolve this issue,
convert all locations in the code to treat the RUN flag as an indicator
that the multicast portion of this interface is in fact administratively
up and joins/leaves/sends can be performed.  There is no harm (other
than a slight performance penalty) to never clearing this flag and using
it in this fashion as it simply means that a few places that used to
micro-optimize how often this task was queued on a work queue will now
queue the task a few extra times.  We can address that suboptimal
behavior in future patches.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 11 +--
  1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index bc50dd0d0e4..91b8fe118ec 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -630,8 +630,6 @@ void ipoib_mcast_join_task(struct work_struct *work)
}
  
  	ipoib_dbg_mcast(priv, successfully joined all multicast groups\n);

-
-   clear_bit(IPOIB_MCAST_RUN, priv-flags);
  }
  
  int ipoib_mcast_start_thread(struct net_device *dev)

@@ -641,8 +639,8 @@ int ipoib_mcast_start_thread(struct net_device *dev)
ipoib_dbg_mcast(priv, starting multicast thread\n);
  

Re: [PATCH 6/8] IPoIB: Use dedicated workqueues per interface

2014-09-10 Thread Erez Shitrit



On 09/04/2014 02:49 AM, Erez Shitrit wrote:

Hi Doug,

The idea that big ipoib workqueue is only for the IB_EVENT_XXX looks
good to me.

one comment here: I don't see where the driver flushes it at all.
IMHO, when the driver goes down you should cancel all pending tasks over
that global wq prior the call for the destroy_workqueue.


I understand your thought, but I disagree with your conclusion. If we 
ever get to the portion of the driver down sequence where we are 
removing the big flush work queue and there are still items on that 
work queue, then we have already failed and we are going to crash 
because there are outstanding flushes for a deleted device.
It is not a rare case: SM events (pkey, hand-over, etc.) arriving while a
driver restarts on devices in the cluster will get you there.
Removing one device out of several on the same host is a rare condition;
anyway, I agree that cancel_work is not the solution.
The real issue here is that we need to make sure it's not possible to 
delete a device that has outstanding flushes, and not possible to 
flush a device in the process of being deleted (Wendy, don't get mad 
at me, but the rtnl lock jumps out as appropriate for this purpose).
we can remove the device and drain the workqueue while checking on each 
event that this event doesn't belong to deleted device (we have the list 
of all existing priv objects in the system via ib_get_client_data) after 
that we can be sure that no more works are for that deleted device.



The current logic calls the destroy_workqueue after the remove_one, i
think you should cancel the pending works after the
ib_unregister_event_handler call in the ipoib_remove_one function,


The ib_remove_one is device specific while the big flush work queue is 
not.  As such, that can cancel work for devices other than the device 
being removed with no clear means of getting it back.



otherwise if there are pending tasks in that queue they will schedule
after the dev/priv are gone.


I agree that there is a problem here.  I think what needs to happen is 
that now that we have a work queue per device, and all flushes come in 
to us via a work queue that does not belong to the device, it is now 
possible to handle flushes synchronously.  This would allow us to lock 
around the flush itself and prevent removal until after the flush is 
complete without getting into the hairy rat hole that is trying to 
flush the flush workqueue that might result in either flushing or 
canceling the very device we are working on.





--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/8] IPoIB: Make the carrier_on_task race aware

2014-09-04 Thread Erez Shitrit

We blindly assume that we can just take the rtnl lock and that will
prevent races with downing this interface.  Unfortunately, that's not
the case.  In ipoib_mcast_stop_thread() we will call flush_workqueue()
in an attempt to clear out all remaining instances of ipoib_join_task.
But, since this task is put on the same workqueue as the join task, the
flush_workqueue waits on this thread too.  But this thread is deadlocked
on the rtnl lock.  The better thing here is to use trylock and loop on
that until we either get the lock or we see that FLAG_ADMIN_UP has
been cleared, in which case we don't need to do anything anyway and we
just return.

Signed-off-by: Doug Ledford dledf...@redhat.com
---
  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 21 +++--
  1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c 
b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index a0a42859f12..7e9cd39b5ef 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -353,18 +353,27 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
   carrier_on_task);
struct ib_port_attr attr;
  
-	/*

-* Take rtnl_lock to avoid racing with ipoib_stop() and
-* turning the carrier back on while a device is being
-* removed.
-*/
if (ib_query_port(priv-ca, priv-port, attr) ||
attr.state != IB_PORT_ACTIVE) {
ipoib_dbg(priv, Keeping carrier off until IB port is 
active\n);
return;
}
  
-	rtnl_lock();

+   /*
+* Take rtnl_lock to avoid racing with ipoib_stop() and
+* turning the carrier back on while a device is being
+* removed.  However, ipoib_stop() will attempt to flush
+* the workqueue while holding the rtnl lock, so loop
+* on trylock until either we get the lock or we see
+* FLAG_ADMIN_UP go away as that signals that we are bailing
+* and can safely ignore the carrier on work
+*/
+   while (!rtnl_trylock()) {
+   if (!test_bit(IPOIB_FLAG_ADMIN_UP, priv-flags))
+   return;


I think that this check shouldn't be connected to the rtnl_lock, if 
IPOIB_FLAG_ADMIN_UP is clear no need to take the carrier on, just wait 
to the next loop when ipoib_open will be called.



+   else
+   msleep(20);
+   }
if (!ipoib_cm_admin_enabled(priv-dev))
dev_set_mtu(priv-dev, min(priv-mcast_mtu, priv-admin_mtu));
netif_carrier_on(priv-dev);


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/8] IPoIB: change init sequence ordering

2014-09-04 Thread Erez Shitrit



In preparation for using per device work queues, we need to move the
start of the neighbor thread task to after ipoib_ib_dev_init and move
the destruction of the neighbor task to before ipoib_ib_dev_cleanup.
Otherwise we will end up freeing our workqueue with work possibly still
on it.

Signed-off-by: Doug Ledford dledf...@redhat.com


Acked-by: Erez Shitrit ere...@mellanox.com


---
  drivers/infiniband/ulp/ipoib/ipoib_main.c | 24 +---
  1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 217cb77157d..949948a46d4 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct 
ib_device *ca, int port)
  {
struct ipoib_dev_priv *priv = netdev_priv(dev);
  
-	if (ipoib_neigh_hash_init(priv)  0)

-   goto out;
/* Allocate RX/TX rings to hold queued skbs */
priv-rx_ring =  kzalloc(ipoib_recvq_size * sizeof *priv-rx_ring,
GFP_KERNEL);
if (!priv-rx_ring) {
printk(KERN_WARNING %s: failed to allocate RX ring (%d 
entries)\n,
   ca-name, ipoib_recvq_size);
-   goto out_neigh_hash_cleanup;
+   goto out;
}
  
  	priv-tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv-tx_ring);

@@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct 
ib_device *ca, int port)
if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
  
+	/*

+* Must be after ipoib_ib_dev_init so we can allocate a per
+* device wq there and use it here
+*/
+   if (ipoib_neigh_hash_init(priv)  0)
+   goto out_dev_uninit;
+
return 0;
  
+out_dev_uninit:

+   ipoib_ib_dev_cleanup();
+
  out_tx_ring_cleanup:
vfree(priv-tx_ring);
  
  out_rx_ring_cleanup:

kfree(priv-rx_ring);
  
-out_neigh_hash_cleanup:

-   ipoib_neigh_hash_uninit(dev);
  out:
return -ENOMEM;
  }
@@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev)
}
unregister_netdevice_many(head);
  
+	/*

+* Must be before ipoib_ib_dev_cleanup or we delete an in use
+* work queue
+*/
+   ipoib_neigh_hash_uninit(dev);
+
ipoib_ib_dev_cleanup(dev);
  
  	kfree(priv-rx_ring);

@@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
  
  	priv-rx_ring = NULL;

priv-tx_ring = NULL;
-
-   ipoib_neigh_hash_uninit(dev);
  }
  
  static const struct header_ops ipoib_header_ops = {


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 1/1] IPoIB: Avoid multicast join attempts when having invalid p_key

2014-07-02 Thread Erez Shitrit

Hi Alex,

Still there is an issue here, please try the following:

1. pkey table contains the pkey 8001
2. echo 0x8001  /sys/class/net/ib0/create_child ; ifconfig ib0.8001 
1.1.1.1 up - till now all good.

3. change the SM partition file, take out the pkey value 8001
4. force the sm to send the new event (PKEY_CHANGE_EVENT) via pkill -HUP 
opensm

5. now put the pkey 8001 back into the partition file
6. again, force the sm to send the new event (PKEY_CHANGE_EVENT) via 
pkill -HUP opensm


The new interface ib0.8001 remains down and its carrier is 0.
please check that,

Thanks, Erez


Currently, the parent interface keeps sending broadcast group join
requests even if p_key index 0 is invalid, which for itself is
possible/common in virtualized environment where a VF has been probed to
VM but the actual p_key configuration has not yet been assigned by the
management software. This creates unnecessary noise on the fabric and in
the kernel logs:

ib0: multicast join failed for ff12:401b:8000:::::,
status -22

The original code runs the multicast task regardless of the actual
p_key value, which can be avoided. The fix is to re-init resources and
bring interface up only if p_key index 0 is valid either when starting
up or on PKEY_CHANGE event.

Fixes: c290414169 ('IPoIB: Fix pkey change flow for virtualization 
environments')

Reviewed-by: Ira Weiny ira.we...@intel.com
Signed-off-by: Alex Estrin alex.est...@intel.com
---
Changes from v4:
- streamline child interface pkey event handling,
- shutdown of pkey polling thread depends on PKEY_STOP flag state only.
The original ipoib_ib_dev_down() could leave the polling thread active
if the PKEY_ASSIGNED flag was set. That could create a race condition on a
subsequent re-initialization of QP resources.
---
  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 +++
  1 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6a7003d..ac941e1 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(data_debug_level,
  #endif
  
  static DEFINE_MUTEX(pkey_mutex);

+static void ipoib_pkey_dev_check_presence(struct net_device *dev);
  
  struct ipoib_ah *ipoib_create_ah(struct net_device *dev,

 struct ib_pd *pd, struct ib_ah_attr *attr)
@@ -669,12 +670,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, priv-pkey_index)) {

-   ipoib_warn(priv, P_Key 0x%04x not found\n, priv-pkey);
-   clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
+   ipoib_pkey_dev_check_presence(dev);
+
+   if (!test_bit(IPOIB_PKEY_ASSIGNED, priv-flags)) {
+   ipoib_warn(priv, P_Key 0x%04x is %s\n, priv-pkey,
+  !(priv-pkey  0x7fff) ? Invalid : not found);
return -1;
}
-   set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
  
  	ret = ipoib_init_qp(dev);

if (ret) {
@@ -712,9 +714,10 @@ dev_stop:
  static void ipoib_pkey_dev_check_presence(struct net_device *dev)
  {
struct ipoib_dev_priv *priv = netdev_priv(dev);
-   u16 pkey_index = 0;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))

+   if (!(priv-pkey  0x7fff) ||
+   ib_find_pkey(priv-ca, priv-port, priv-pkey,
+priv-pkey_index))
clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
else
set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
@@ -746,12 +749,10 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
netif_carrier_off(dev);
  
  	/* Shutdown the P_Key thread if still active */

-   if (!test_bit(IPOIB_PKEY_ASSIGNED, priv-flags)) {
-   mutex_lock(pkey_mutex);
-   set_bit(IPOIB_PKEY_STOP, priv-flags);
+   mutex_lock(pkey_mutex);
+   if (!test_and_set_bit(IPOIB_PKEY_STOP, priv-flags))
cancel_delayed_work_sync(priv-pkey_poll_task);
-   mutex_unlock(pkey_mutex);
-   }
+   mutex_unlock(pkey_mutex);
  
  	ipoib_mcast_stop_thread(dev, flush);

ipoib_mcast_dev_flush(dev);
@@ -972,7 +973,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
  {
struct ipoib_dev_priv *cpriv;
struct net_device *dev = priv-dev;
-   u16 new_index;
+   u16 old_index;
int result;
  
  	down_read(priv-vlan_rwsem);

@@ -986,16 +987,17 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
  
  	up_read(priv-vlan_rwsem);
  
-	if (!test_bit(IPOIB_FLAG_INITIALIZED, priv-flags)) {

-   /* for non-child devices must check/update the pkey value here 
*/
-   if (level == IPOIB_FLUSH_HEAVY 
-   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
-   

Re: [PATCH v4 1/1] IPoIB: Avoid multicast join attempts when having invalid p_key

2014-06-22 Thread Erez Shitrit

Hi Alex,
I played with your patch over 3.16-rc1 and got strange behaviour (I
think there is a problem with the patch; it doesn't happen without the
patch).


please try the next scenario:
1. pkey table contains only the default pkey
2. echo 0x8001 > /sys/class/net/ib0/create_child ; ifconfig ib0.8001
1.1.1.1 up

3. change the SM partition file to include the new pkey value 8001
4. force the sm to send the new event (PKEY_CHANGE_EVENT) via pkill -HUP 
opensm


The new interface ib0.8001 remains down and its carrier is 0;
without the patch the carrier is 1.

Please check that.

Thanks, Erez

6/17/2014 6:06 PM, Alex Estrin:

Currently, the parent interface keeps sending broadcast group join
requests even if p_key index 0 is invalid, which for itself is
possible/common in virtualized environment where a VF has been probed to
VM but the actual p_key configuration has not yet been assigned by the
management software. This creates unnecessary noise on the fabric and in
the kernel logs:

ib0: multicast join failed for ff12:401b:8000:::::,
status -22

The original code run the multicast task regardless of the actual
p_key value, which can be avoided. The fix is to re-init resources  and
bring interface up only if p_key index 0 is valid either when starting
up or on PKEY_CHANGE event.

Fixes: c290414169 ('IPoIB: Fix pkey change flow for virtualization 
environments')

Reviewed-by: Ira Weiny ira.we...@intel.com
Signed-off-by: Alex Estrin alex.est...@intel.com
---
  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   35 +++
  1 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6a7003d..952db0b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(data_debug_level,
  #endif
  
  static DEFINE_MUTEX(pkey_mutex);

+static void ipoib_pkey_dev_check_presence(struct net_device *dev);
  
  struct ipoib_ah *ipoib_create_ah(struct net_device *dev,

 struct ib_pd *pd, struct ib_ah_attr *attr)
@@ -669,12 +670,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, priv-pkey_index)) {

-   ipoib_warn(priv, P_Key 0x%04x not found\n, priv-pkey);
-   clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
+   ipoib_pkey_dev_check_presence(dev);
+
+   if (!test_bit(IPOIB_PKEY_ASSIGNED, priv-flags)) {
+   ipoib_warn(priv, P_Key 0x%04x is %s\n, priv-pkey,
+  !(priv-pkey  0x7fff) ? Invalid : not found);
return -1;
}
-   set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
  
  	ret = ipoib_init_qp(dev);

if (ret) {
@@ -712,9 +714,10 @@ dev_stop:
  static void ipoib_pkey_dev_check_presence(struct net_device *dev)
  {
struct ipoib_dev_priv *priv = netdev_priv(dev);
-   u16 pkey_index = 0;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))

+   if (!(priv-pkey  0x7fff) ||
+   ib_find_pkey(priv-ca, priv-port, priv-pkey,
+priv-pkey_index))
clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
else
set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
@@ -987,15 +990,17 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
up_read(priv-vlan_rwsem);
  
  	if (!test_bit(IPOIB_FLAG_INITIALIZED, priv-flags)) {

-   /* for non-child devices must check/update the pkey value here 
*/
-   if (level == IPOIB_FLUSH_HEAVY 
-   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
-   update_parent_pkey(priv);
-   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
-   return;
+   if (level != IPOIB_FLUSH_HEAVY) {
+   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
+   return;
+   }
}
  
  	if (!test_bit(IPOIB_FLAG_ADMIN_UP, priv-flags)) {

+   /* interface is down. update pkey and leave. */
+   if (level == IPOIB_FLUSH_HEAVY 
+   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
+   update_parent_pkey(priv);
ipoib_dbg(priv, Not flushing - IPOIB_FLAG_ADMIN_UP not 
set.\n);
return;
}
@@ -1038,8 +1043,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
ipoib_ib_dev_down(dev, 0);
  
  	if (level == IPOIB_FLUSH_HEAVY) {

-   ipoib_ib_dev_stop(dev, 0);
-   ipoib_ib_dev_open(dev);
+   if (test_bit(IPOIB_FLAG_INITIALIZED, priv-flags))
+   ipoib_ib_dev_stop(dev, 0);
+   if (ipoib_ib_dev_open(dev) != 0)
+   

Re: [PATCH v3 1/1] IPoIB: Improve parent interface p_key handling.

2014-06-16 Thread Erez Shitrit

6/11/2014 10:22 PM, Alex Estrin:

With reference to commit c2904141696ee19551f155396f23cdd5d95e
(IPoIB: Fix pkey change flow for virtualization environments)
It was noticed that parent interface keeps sending broadcast group
join requests if p_key index 0 is invalid. That creates unnecessary
noise on a fabric:

ib0: multicast join failed for ff12:401b:8000:::::,
status -22

Proposed patch re-init resources, then brings interface
up only if p_key idx 0 is valid either on bootup or on PKEY_CHANGE event.
Original code could run multicast task regardless of p_key value,
which we want to avoid.

Reviewed-by: Ira Weiny ira.we...@intel.com
Signed-off-by: Alex Estrin alex.est...@intel.com

Acked-by: Erez Shitrit ere...@dev.mellanox.co.il

---
  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   35 +++
  1 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6a7003d..a2af996 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(data_debug_level,
  #endif
  
  static DEFINE_MUTEX(pkey_mutex);

+static void ipoib_pkey_dev_check_presence(struct net_device *dev);
  
  struct ipoib_ah *ipoib_create_ah(struct net_device *dev,

 struct ib_pd *pd, struct ib_ah_attr *attr)
@@ -669,12 +670,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, priv-pkey_index)) {

-   ipoib_warn(priv, P_Key 0x%04x not found\n, priv-pkey);
-   clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
+   ipoib_pkey_dev_check_presence(dev);
+
+   if (!test_bit(IPOIB_PKEY_ASSIGNED, priv-flags)) {
+   ipoib_warn(priv, P_Key 0x%04x is %s\n, priv-pkey,
+   !(priv-pkey  0x7fff) ? Invalid : not found);
return -1;
}
-   set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
  
  	ret = ipoib_init_qp(dev);

if (ret) {
@@ -712,9 +714,10 @@ dev_stop:
  static void ipoib_pkey_dev_check_presence(struct net_device *dev)
  {
struct ipoib_dev_priv *priv = netdev_priv(dev);
-   u16 pkey_index = 0;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))

+   if (!(priv-pkey  0x7fff) ||
+   ib_find_pkey(priv-ca, priv-port, priv-pkey,
+   priv-pkey_index))
clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
else
set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
@@ -987,15 +990,17 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
up_read(priv-vlan_rwsem);
  
  	if (!test_bit(IPOIB_FLAG_INITIALIZED, priv-flags)) {

-   /* for non-child devices must check/update the pkey value here 
*/
-   if (level == IPOIB_FLUSH_HEAVY 
-   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
-   update_parent_pkey(priv);
-   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
-   return;
+   if (level != IPOIB_FLUSH_HEAVY) {
+   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
+   return;
+   }
}
  
  	if (!test_bit(IPOIB_FLAG_ADMIN_UP, priv-flags)) {

+   /* interface is down. update pkey and leave. */
+   if (level == IPOIB_FLUSH_HEAVY 
+   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
+   update_parent_pkey(priv);
ipoib_dbg(priv, Not flushing - IPOIB_FLAG_ADMIN_UP not 
set.\n);
return;
}
@@ -1038,8 +1043,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
ipoib_ib_dev_down(dev, 0);
  
  	if (level == IPOIB_FLUSH_HEAVY) {

-   ipoib_ib_dev_stop(dev, 0);
-   ipoib_ib_dev_open(dev);
+   if (test_bit(IPOIB_FLAG_INITIALIZED, priv-flags))
+   ipoib_ib_dev_stop(dev, 0);
+   if (ipoib_ib_dev_open(dev) != 0)
+   return;
}
  
  	/*


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/1] IPoIB: Improve parent interface p_key handling.

2014-06-10 Thread Erez Shitrit

Hi Alex,

Perhaps I am missing something, but in my understanding the facts are as
follows:


- ib_register_event_handler is called in add_port at the load time of 
the driver, when the ib ports recognized, in that function the driver 
queries for pkey index 0.


- ipoib_pkey_dev_delay_open only looks for the value that should already
be in priv->pkey; someone needs to fill it with the right value.


so, the case as i see it is:

add_one() --no valid pkey in index 0
...
...
ipoib_stop() // via ifconfig ib0 down or alike
.
event: PKEY_CHANGE - here the ADMIN_UP is clear so there will be no 
query for pkey-index-0

.
ipoib_open()

and now the driver left with no valid value till the next PKEY_CHANGE event.

Thanks, Erez

 6/10/2014 4:39 PM, Estrin, Alex:

Hi Erez,
Please see below.
Thanks,
Alex.


-Original Message-
From: Erez Shitrit [mailto:ere...@dev.mellanox.co.il]
Sent: Tuesday, June 10, 2014 1:49 AM
To: Estrin, Alex
Cc: Roland Dreier; linux-rdma@vger.kernel.org
Subject: Re: [PATCH v2 1/1] IPoIB: Improve parent interface p_key handling.

Hi Alex,
one comment (more specific about a comment i wrote before)

all the rest looks good to me.

Thanks, Erez

6/10/2014 12:55 AM, Alex Estrin:

With reference to commit c2904141696ee19551f155396f23cdd5d95e.
It was noticed that parent interface keeps sending broadcast group
join requests if p_key index 0 is invalid. That creates unnecessary
noise on a fabric:

ib0: multicast join failed for ff12:401b:8000:::::,
status -22

Proposed patch re-init resources, then brings interface
up only if p_key idx 0 is valid either on bootup or on PKEY_CHANGE event.
Original code could run multicast task regardless of p_key value,
which we want to avoid.

Modified event handler will utilize following strategy:
if interface is not initialized and event is not PKEY_CHANGE related - return.
call update_parent_pkey() - if pkey hasn't changed - return.
if interface is initialized
  flush it - call ipoib_ib_dev_stop() - de-initialized.
Then start multicast task only if ipoib_ib_dev_open() has succeeded, 
reinitialized,
i.e. p_key is valid.

Changes from v1:
p_key check for 'Invalid' value was moved to
ipoib_pkey_dev_check_presence() that is used now in ipoib_ib_dev_open()
for p_key validation.
Whitespace and format adjusted.

Reviewed-by: Ira Weiny ira.we...@intel.com
Signed-off-by: Alex Estrin alex.est...@intel.com
---
   drivers/infiniband/ulp/ipoib/ipoib_ib.c |   31 
+--
   1 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c

b/drivers/infiniband/ulp/ipoib/ipoib_ib.c

index 6a7003d..627f74f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(data_debug_level,
   #endif

   static DEFINE_MUTEX(pkey_mutex);
+static void ipoib_pkey_dev_check_presence(struct net_device *dev);

   struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
 struct ib_pd *pd, struct ib_ah_attr *attr)
@@ -669,12 +670,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;

-   if (ib_find_pkey(priv-ca, priv-port, priv-pkey, priv-pkey_index)) {
-   ipoib_warn(priv, P_Key 0x%04x not found\n, priv-pkey);
-   clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
+   ipoib_pkey_dev_check_presence(dev);
+
+   if (!test_bit(IPOIB_PKEY_ASSIGNED, priv-flags)) {
+   ipoib_warn(priv, P_Key 0x%04x is %s\n, priv-pkey,
+   !(priv-pkey  0x7fff) ? Invalid : not found);
return -1;
}
-   set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);

ret = ipoib_init_qp(dev);
if (ret) {
@@ -712,9 +714,10 @@ dev_stop:
   static void ipoib_pkey_dev_check_presence(struct net_device *dev)
   {
struct ipoib_dev_priv *priv = netdev_priv(dev);
-   u16 pkey_index = 0;

-   if (ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))
+   if (!(priv-pkey  0x7fff) ||
+   ib_find_pkey(priv-ca, priv-port, priv-pkey,
+   priv-pkey_index))
clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
else
set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
@@ -987,12 +990,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
up_read(priv-vlan_rwsem);

if (!test_bit(IPOIB_FLAG_INITIALIZED, priv-flags)) {
-   /* for non-child devices must check/update the pkey value here 
*/
-   if (level == IPOIB_FLUSH_HEAVY 
-   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
-   update_parent_pkey(priv);
-   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
-   return;
+   if (level  IPOIB_FLUSH_HEAVY) {
+   ipoib_dbg(priv

Re: [PATCH v2 1/1] IPoIB: Improve parent interface p_key handling.

2014-06-09 Thread Erez Shitrit

Hi Alex,
one comment (more specific about a comment i wrote before)

all the rest looks good to me.

Thanks, Erez

6/10/2014 12:55 AM, Alex Estrin:

With reference to commit c2904141696ee19551f155396f23cdd5d95e.
It was noticed that parent interface keeps sending broadcast group
join requests if p_key index 0 is invalid. That creates unnecessary
noise on a fabric:

ib0: multicast join failed for ff12:401b:8000:::::,
status -22

Proposed patch re-init resources, then brings interface
up only if p_key idx 0 is valid either on bootup or on PKEY_CHANGE event.
Original code could run multicast task regardless of p_key value,
which we want to avoid.

Modified event handler will utilize following strategy:
if interface is not initialized and event is not PKEY_CHANGE related - return.
call update_parent_pkey() - if pkey hasn't changed - return.
if interface is initialized
 flush it - call ipoib_ib_dev_stop() - de-initialized.
Then start multicast task only if ipoib_ib_dev_open() has succeeded, 
reinitialized,
i.e. p_key is valid.

Changes from v1:
p_key check for 'Invalid' value was moved to
ipoib_pkey_dev_check_presence() that is used now in ipoib_ib_dev_open()
for p_key validation.
Whitespace and format adjusted.

Reviewed-by: Ira Weiny ira.we...@intel.com
Signed-off-by: Alex Estrin alex.est...@intel.com
---
  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   31 +--
  1 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6a7003d..627f74f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -52,6 +52,7 @@ MODULE_PARM_DESC(data_debug_level,
  #endif
  
  static DEFINE_MUTEX(pkey_mutex);

+static void ipoib_pkey_dev_check_presence(struct net_device *dev);
  
  struct ipoib_ah *ipoib_create_ah(struct net_device *dev,

 struct ib_pd *pd, struct ib_ah_attr *attr)
@@ -669,12 +670,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, priv-pkey_index)) {

-   ipoib_warn(priv, P_Key 0x%04x not found\n, priv-pkey);
-   clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
+   ipoib_pkey_dev_check_presence(dev);
+
+   if (!test_bit(IPOIB_PKEY_ASSIGNED, priv-flags)) {
+   ipoib_warn(priv, P_Key 0x%04x is %s\n, priv-pkey,
+   !(priv-pkey  0x7fff) ? Invalid : not found);
return -1;
}
-   set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
  
  	ret = ipoib_init_qp(dev);

if (ret) {
@@ -712,9 +714,10 @@ dev_stop:
  static void ipoib_pkey_dev_check_presence(struct net_device *dev)
  {
struct ipoib_dev_priv *priv = netdev_priv(dev);
-   u16 pkey_index = 0;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))

+   if (!(priv-pkey  0x7fff) ||
+   ib_find_pkey(priv-ca, priv-port, priv-pkey,
+   priv-pkey_index))
clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
else
set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
@@ -987,12 +990,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
up_read(priv-vlan_rwsem);
  
  	if (!test_bit(IPOIB_FLAG_INITIALIZED, priv-flags)) {

-   /* for non-child devices must check/update the pkey value here 
*/
-   if (level == IPOIB_FLUSH_HEAVY 
-   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
-   update_parent_pkey(priv);
-   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
-   return;
+   if (level  IPOIB_FLUSH_HEAVY) {
+   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
+   return;
+   }
}
  
I think that if you take these lines and IPOIB_FLAG_ADMIN_UP is not
set, you will miss that event and will not read the pkey at index 0.
The assumption is that FLAG_INITIALIZED comes after ADMIN_UP, so you
can find a case where both of them are not set. The main idea is that no
matter what the priv state is, the driver should handle the PKEY_CHANGE
event.

if (!test_bit(IPOIB_FLAG_ADMIN_UP, priv-flags)) {
@@ -1038,8 +1039,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
ipoib_ib_dev_down(dev, 0);
  
  	if (level == IPOIB_FLUSH_HEAVY) {

-   ipoib_ib_dev_stop(dev, 0);
-   ipoib_ib_dev_open(dev);
+   if (test_bit(IPOIB_FLAG_INITIALIZED, priv-flags))
+   ipoib_ib_dev_stop(dev, 0);
+   if (ipoib_ib_dev_open(dev) != 0)
+   return;
}
  
  	/*


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message 

Re: [PATCH 1/1] IPoIB: Improve parent interface p_key handling.

2014-06-08 Thread Erez Shitrit

Hi Alex,
it is a good idea, i have 2 comments.

Thanks, Erez

Resending as there was a typo in Subject line.

--

With reference to commit c2904141696ee19551f155396f23cdd5d95e.
It was noticed that parent interface keeps sending broadcast group
join requests if p_key index 0 is invalid. That creates unnecessary
noise on a fabric:

ib0: multicast join failed for ff12:401b:8000:::::,
  status -22

Proposed patch re-init resources, then brings interface
up only if p_key idx 0 is valid either on bootup or on PKEY_CHANGE event.
Otherwise it keeps interface quiet.

Reviewed-by: Ira Weiny ira.we...@intel.com
Signed-off-by: Alex Estrin alex.est...@intel.com
---
  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   25 -
  1 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6a7003d..3201fe5 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -669,6 +669,12 @@ int ipoib_ib_dev_open(struct net_device *dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
int ret;
  
+	if (!(priv-pkey  0x7fff)) {

+   ipoib_warn(priv, Invalid P_Key\n);
+   clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
+   return -1;
+   }
+
if (ib_find_pkey(priv-ca, priv-port, priv-pkey, priv-pkey_index)) {
ipoib_warn(priv, P_Key 0x%04x not found\n, priv-pkey);
clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
Instead of that check here, you can call
ipoib_pkey_dev_check_presence again, so there is one place for checking that.

(like it is in ipoib_ib_dev_up)


@@ -714,7 +720,8 @@ static void ipoib_pkey_dev_check_presence(struct net_device 
*dev)
struct ipoib_dev_priv *priv = netdev_priv(dev);
u16 pkey_index = 0;
  
-	if (ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))

+   if (!(priv-pkey  0x7fff) ||
+   ib_find_pkey(priv-ca, priv-port, priv-pkey, pkey_index))
Here should be the only place for checking, perhaps you will want to 
change the prototype of the function, to return int instead.

clear_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
else
set_bit(IPOIB_PKEY_ASSIGNED, priv-flags);
@@ -987,12 +994,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
up_read(priv-vlan_rwsem);
  
  	if (!test_bit(IPOIB_FLAG_INITIALIZED, priv-flags)) {

-   /* for non-child devices must check/update the pkey value here 
*/
-   if (level == IPOIB_FLUSH_HEAVY 
-   !test_bit(IPOIB_FLAG_SUBINTERFACE, priv-flags))
-   update_parent_pkey(priv);
-   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
-   return;
+   if (level  IPOIB_FLUSH_HEAVY) {
+   ipoib_dbg(priv, Not flushing - IPOIB_FLAG_INITIALIZED not 
set.\n);
+   return;
+   }
}
  
I think that you cannot skip the call to update_parent_pkey; otherwise,
if the pkey value at index 0 was changed before the bit
IPOIB_FLAG_INITIALIZED was set, the pkey will not be updated.

So, IMHO, you can leave the code at that point as it was before.

if (!test_bit(IPOIB_FLAG_ADMIN_UP, priv-flags)) {
@@ -1038,8 +1043,10 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv 
*priv,
ipoib_ib_dev_down(dev, 0);
  
  	if (level == IPOIB_FLUSH_HEAVY) {

-   ipoib_ib_dev_stop(dev, 0);
-   ipoib_ib_dev_open(dev);
+   if (test_bit(IPOIB_FLAG_INITIALIZED, priv-flags))
+   ipoib_ib_dev_stop(dev, 0);
+   if (ipoib_ib_dev_open(dev) != 0)
+   return;
}
  
  	/*


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: AW: AW: AW: IPoIB GRO

2013-11-05 Thread Erez Shitrit



I see. This didn't happen on our setups here since we test with
newer cards (ConnectX2/3/3-pro).
For the ConnectX1 (A0) and the firmware that you are using, it smells
like something goes wrong. If possible, I would change to a newer
card.

No problem with that. My journey up to here was hard but very
interesting. Especially when you expect everything in the system
to be consistent and new speedups with every new kernel or
driver version. Encountering a throughput drop of nearly 50%
with the upgrade of our NFS servers I was challenged.

With TSO disabled on our old cards I'm back to LRO speeds and
I'm more than happy with that.

Just a final clarification for the interested reader: Are the TCP Ids
in an TSO setup generated through firmware or in the software
stack? And if in firmware: How does the card know how to
increase them? I would expect that it only works with IB packets
and does not know of the IP encapsulation.
The card (HW) knows how to deal with IP packets; the card is configured
via the FW to increase the ip-id for each IP packet that is part of
the full message.


So, to summarize:
The HW does the work (splits the big IP packet into a series of IP
packets, each with the relevant MTU size, and increases the ip-id for each).

The FW enables that work on the HW;
the FW in the A0 card doesn't enable that option for the HW.



Best regards.

Markus


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IPoIB GRO

2013-11-04 Thread Erez Shitrit

Hi Markus,

As Or already mentioned, it seems that we have accumulation of IP
packets when GRO is enabled over the ib interface; from tcpdump on the
receive side we can see:


10:09:27.336951 IP 11.134.33.1.41377 > 11.134.41.1.35957: Flags [.], seq
3795959253:3796023381, ack 2, win 110, length 64128
10:09:27.336987 IP 11.134.41.1.35957 > 11.134.33.1.41377: Flags [.], ack
3796023381, win 2036, length 0
10:09:27.337022 IP 11.134.33.1.41377 > 11.134.41.1.35957: Flags [.], seq
3796023381:3796087509, ack 2, win 110, length 64128
10:09:27.337044 IP 11.134.41.1.35957 > 11.134.33.1.41377: Flags [.], ack
3796087509, win 3038, length 0
10:09:27.337083 IP 11.134.33.1.41377 > 11.134.41.1.35957: Flags [.], seq
3796087509:3796151637, ack 2, win 110, length 64128
10:09:27.337107 IP 11.134.41.1.35957 > 11.134.33.1.41377: Flags [.], ack
3796151637, win 4040, length 0
10:09:27.337142 IP 11.134.33.1.41377 > 11.134.41.1.35957: Flags [.], seq
3796151637:3796215765, ack 2, win 110, length 64128

.


don't you see that behaviour in tcpdump? what kernel are you using?

I will take a look into the gro/our code to check if we missed 
something, and update.


Thanks, Erez


Hello,

I have a little update to the unlucky GRO IPoIB behaviour I observed
in the last weeks in datagram mode on our ConnectX cards. In the
GRO receive path the kernel steps into the inet_gro_receive() function
of net/ipv4/af_inet.c. If I read the code right it compares two
IP packets and decides if they come from the same flow.
Further checks are included in some subroutines that narrow
down the comparison to IPv4 and so on.

I put a debugging message into the following comparison that
seems to be the culprit of it all.

inet_gro_receive()
   ...
   /* All fields must match except length and checksum. */
   NAPI_GRO_CB(p)-flush |=
 (iph-ttl ^ iph2-ttl) |
 (iph-tos ^ iph2-tos) |
 (__force int)((iph-frag_off ^ iph2-frag_off)  htons(IP_DF)) |
 ((u16)(ntohs(iph2-id) + NAPI_GRO_CB(p)-count) ^ id);
   /* Do some debug */
   printk(%i %i %i\n,ntohs(iph2-id),NAPI_GRO_CB(p)-count,id);
   ...

On a normal GBit Intel card the kernel output reads:

32933 12 32945
32933 13 32946
32946 1 32947
32946 2 32948
...
32946 15 32961
32964 3 32967
32964 4 32968
...

The interpretation of it all should be that packet ids must match
the sum of the initial packet id plus its count field. Then
we have a GRO candidate.

On our ib0 interface the count field of a received packet seems
to be 1 most of the time and the packet id always matches the
initial packet id:

35754 1 35754
35754 1 35754
35754 1 35754
...
35754 1 35786
35786 1 35786
35786 1 35786
...

Thats why the flush flag is always set and the GRO stack does
not work at all. I'm willing to dig deeper into this but I'm unsure
if those fields are filled on sender or receiver side and especially
where in the IPoIB stack. Maybe someone can point me into the
right direction so that I can dig deeper and provide some more
information.

Bet regards.

Markus



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: AW: IPoIB GRO

2013-11-04 Thread Erez Shitrit

Hi Markus,

Can you please tell me what is the FW version you have on your ConnectX 
cards?


Thanks, Erez


Hi Erez,


don't you see that behaviour in tcpdump? what kernel are you using?

On the server side we have a 3.5 kernel and on the client side a 3.11 kernel,
each of them with the standard kernel drivers/modules. I can see the same
pattern of GRO aggregation on the client that you mention, but only if I
disable TSO for ib0 on the server side.

The test I'm running on the client is like this. The second and third read
run are definetly served by the NFS server side cache.

sysctl -w net.ipv4.tcp_mem=4096 65536 4194304
sysctl -w net.ipv4.tcp_rmem=4096 65536 4194304
sysctl -w net.ipv4.tcp_wmem=4096 65536 4194304
sysctl -w net.core.rmem_max=8388608
sysctl -w net.core.wmem_max=8388608

mount -o nfsvers=3,rsize=262144,wsize=262144 10.10.30.251:/export /mnt
echo 3 > /proc/sys/vm/drop_caches
dd if=/mnt/xxx.iso of=/dev/null bs=1M count=5000
echo 3 > /proc/sys/vm/drop_caches
dd if=/mnt/xxx.iso of=/dev/null bs=1M count=5000
echo 3 > /proc/sys/vm/drop_caches
dd if=/mnt/xxx.iso of=/dev/null bs=1M count=5000
umount /mnt

Markus


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: AW: ACK behaviour difference LRO/GRO

2013-10-29 Thread Erez Shitrit

בתאריך 10/29/2013 1:43 PM, ציטוט Or Gerlitz:

On 29/10/2013 13:10, Markus Stockhausen wrote:

Just to be on the right way: What are the basics to get GRO working
with a ConnectX (not 2 or 3) card in 2044 MTU datagram mode?

- enable GRO with ethtool.
- Activate Coalescing with ethtool? If yes how?
GRO is SW element of the network stack, so the HCA doesn't matter, you 
need IPoIB to support that correct and enable it with ethtool

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html


In addition to what Or just wrote,
GRO currently doesn't work on ipoib interfaces; that is due to bad
handling of MAC addresses that are not 6 bytes (we have plans to fix that in
the near future). That is the reason you don't see 64k packets in
tcpdump (like you see with LRO).


You have a few options:
1. use an old kernel and enable LRO (which works fine with ipoib)
2. use CM mode instead of UD; that will probably improve the performance.

Thanks, Erez
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/1] ib/ipoib: Added adaptive moderation algorithm for better latency.

2011-08-14 Thread Erez Shitrit
Hi Roland,

The following patch adds a new algorithm for adaptive moderation.
The main idea is changing the CQ moderation (only for the RX) according to the 
traffic.
The Adaptive moderation is controlled via ethtool: adaptive-rx on/off.
(more details in the patch itself.)

Some results without/with adaptive-rx:
--
for latency i run the test: 
netperf -n 8 -H 11.134.14.1 -c -C -P 1 -t UDP_RR -l 10
for BW i run the test:
iperf -c 11.134.14.1 -l 64K -P8

first setup:
no adaptive moderation, default moderation values:
 rx-usecs: 0 rx-frames: 0
we will get the next results:
latency 9 usec 
BW 6.60 Gbits/sec

second setup:
no adaptive moderation, manually, moderation values:
 rx-usecs: 10 rx-frames: 44
we will get the next results:
latency 18 usec 
BW 8.50 Gbits/sec

third setup:
adaptive moderation on (Adaptive RX: on)
we will get the next results:
latency 9 usec 
BW 8.60 Gbits/sec


As you can see, adaptive moderation gets the best of both tests (from
different kinds of traffic).

Thanks, Erez
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/1] ib/ipoib: Added adaptive moderation algorithm for better latency.

2011-08-14 Thread Erez Shitrit
From 8ea4a6d4387a07b4e0abfb92f164f5181cf636e4 Mon Sep 17 00:00:00 2001
From: Erez Shitrit ere...@mellanox.co.il
Date: Thu, 30 Jun 2011 09:58:09 +0300
Subject: [PATCH 1/2] ib/ipoib: Added adaptive moderation algorithm for better 
latency.

[PATCH V2]:
Adaptive moderation is controlled via ethtool: adaptive-rx on/off.

When adaptive moderation is enabled, the adaptive moderation task is started.
 The task runs on a new workqueue and reschedules itself for the next run.
 The task periodically (every 250 ms) samples the traffic (packet rate and
average packet sizes),
 and runs an algorithm to define a new moderation time for the receive queue.
 The algorithm classifies the incoming traffic during each sampling interval
 into classes. The rx_usec value (i.e., moderation time) is then adjusted 
appropriately per class.

 There are two classes defined:

  A.  Bulk traffic: for heavy traffic consisting of packets of normal size.
  This class is further divided into two sub-classes:
1. Traffic that is mainly BW bound
- This traffic will get maximum moderation.
2. Traffic that is mostly latency bound
- For situations where low latency is vital such as cluster or 
grid computing
- For this traffic the rx_usec will be changed to a value in the 
range (ethtool.pkt_rate_low  .. ethtool.pkt_rate_high) depending on sampled 
packet rate.

  B.  Low latency traffic: for minimal traffic, or traffic consisting almost 
completely of small packets.
- This traffic will get minimum moderation.

Signed-off-by: Erez Shitrit ere...@mellanox.co.il
Reviewed-by: Eli cohen e...@mellanox.co.il
---
 drivers/infiniband/ulp/ipoib/ipoib.h   |   39 ++-
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c   |   67 +-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c|6 +
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |  167 
+++-
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |8 +-
 5 files changed, 278 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index 7b6985a..c58f231 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -91,6 +91,7 @@ enum {
IPOIB_STOP_REAPER = 7,
IPOIB_FLAG_ADMIN_CM   = 9,
IPOIB_FLAG_UMCAST = 10,
+   IPOIB_FLAG_AUTO_MODER = 11, /*indicates moderation is running*/
 
IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -253,9 +254,43 @@ struct ipoib_cm_dev_priv {
int num_frags;
 };
 
+/* adaptive moderation parameters: */
+enum {
+   /* Target number of packets to coalesce with interrupt moderation */
+   IPOIB_RX_COAL_TARGET= 44,
+   IPOIB_RX_COAL_TIME  = 16,
+   IPOIB_TX_COAL_PKTS  = 5,
+   IPOIB_TX_COAL_TIME  = 0x80,
+   IPOIB_RX_RATE_LOW   = 400000,
+   IPOIB_RX_COAL_TIME_LOW  = 0,
+   IPOIB_RX_RATE_HIGH  = 450000,
+   IPOIB_RX_COAL_TIME_HIGH = 128,
+   IPOIB_RX_SIZE_THRESH= 1024,
+   IPOIB_RX_RATE_THRESH= 1000000 / IPOIB_RX_COAL_TIME_HIGH,
+   IPOIB_SAMPLE_INTERVAL   = 0,
+   IPOIB_AVG_PKT_SMALL = 256,
+   IPOIB_AUTO_CONF = 0xffff,
+   ADAPT_MODERATION_DELAY  = HZ / 4,
+};
+
 struct ipoib_ethtool_st {
-   u16 coalesce_usecs;
+   __u32 rx_max_coalesced_frames;
+   __u32 rx_coalesce_usecs;
+/* u16 coalesce_usecs;
u16 max_coalesced_frames;
+*/
+   __u32   pkt_rate_low;
+   __u32   pkt_rate_high;
+   __u32   rx_coalesce_usecs_low;
+   __u32   rx_coalesce_usecs_high;
+   __u32   rate_sample_interval;
+   __u32   use_adaptive_rx_coalesce;
+   int last_moder_time;
+   u16 sample_interval;
+   unsigned long last_moder_jiffies;
+   unsigned long last_moder_packets;
+   unsigned long last_moder_tx_packets;
+   unsigned long last_moder_bytes;
 };
 
 /*
@@ -289,6 +324,7 @@ struct ipoib_dev_priv {
struct work_struct flush_heavy;
struct work_struct restart_task;
struct delayed_work ah_reap_task;
+   struct delayed_work adaptive_moder_task;
 
struct ib_device *ca;
u8port;
@@ -409,6 +445,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour 
*neigh,
 void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh);
 
 extern struct workqueue_struct *ipoib_workqueue;
+extern struct workqueue_struct *ipoib_auto_moder_workqueue;
 
 /* functions */
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
index 29bc7b5..b41c061 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -46,18 +46,30 @@ static int ipoib_get_coalesce(struct net_device *dev,
  struct ethtool_coalesce *coal)
 {
	struct ipoib_dev_priv *priv = netdev_priv(dev);

[PATCH 0/1] mlx4-core: enable changing default max HCA resource limits.

2011-08-03 Thread Erez Shitrit
Hi Roland,

This patch was first posted on Oct 16, 2007 by Jack Morgenstein (but got 
overlooked).

According to customers request I'm re-posting its current incarnation.

Please queue up for kernel.

Thanks, Erez
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/1] mlx4-core: enable changing default max HCA resource limits.

2011-08-03 Thread Erez Shitrit
Enable module-initialization time modification of default HCA
maximum resource limits via module parameters, as is done in mthca.

Specify the log of the parameter value, rather than the value itself
to avoid the hidden side-effect of rounding up values to next power-of-2.

Signed-off-by: Jack Morgenstein ja...@mellanox.co.il
Signed-off-by: Erez Shitrit ere...@mellanox.co.il
---
 drivers/net/mlx4/main.c |   51 
+++
 1 files changed, 51 insertions(+), 0 deletions(-)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 3814fc9..0c36854 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG);
 module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
 MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-7)");
 
+static struct mlx4_profile mod_param_profile = { 0 };
+
+module_param_named(log_num_qp, mod_param_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(log_num_qp, "log maximum number of QPs per HCA");
+
+module_param_named(log_num_srq, mod_param_profile.num_srq, int, 0444);
+MODULE_PARM_DESC(log_num_srq, "log maximum number of SRQs per HCA");
+
+module_param_named(log_rdmarc_per_qp, mod_param_profile.rdmarc_per_qp, int, 0444);
+MODULE_PARM_DESC(log_rdmarc_per_qp, "log number of RDMARC buffers per QP");
+
+module_param_named(log_num_cq, mod_param_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(log_num_cq, "log maximum number of CQs per HCA");
+
+module_param_named(log_num_mcg, mod_param_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(log_num_mcg, "log maximum number of multicast groups per HCA");
+
+module_param_named(log_num_mpt, mod_param_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(log_num_mpt,
+   "log maximum number of memory protection table entries per HCA");
+
+module_param_named(log_num_mtt, mod_param_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(log_num_mtt,
+   "log maximum number of memory translation table segments per HCA");
+
+static void process_mod_param_profile(void)
+{
	default_profile.num_qp = (mod_param_profile.num_qp ?
			      1 << mod_param_profile.num_qp :
			      default_profile.num_qp);
	default_profile.num_srq = (mod_param_profile.num_srq ?
			      1 << mod_param_profile.num_srq :
			      default_profile.num_srq);
	default_profile.rdmarc_per_qp = (mod_param_profile.rdmarc_per_qp ?
			      1 << mod_param_profile.rdmarc_per_qp :
			      default_profile.rdmarc_per_qp);
	default_profile.num_cq = (mod_param_profile.num_cq ?
			      1 << mod_param_profile.num_cq :
			      default_profile.num_cq);
	default_profile.num_mcg = (mod_param_profile.num_mcg ?
			      1 << mod_param_profile.num_mcg :
			      default_profile.num_mcg);
	default_profile.num_mpt = (mod_param_profile.num_mpt ?
			      1 << mod_param_profile.num_mpt :
			      default_profile.num_mpt);
	default_profile.num_mtt = (mod_param_profile.num_mtt ?
			      1 << mod_param_profile.num_mtt :
			      default_profile.num_mtt);
+}
+
 int mlx4_check_port_params(struct mlx4_dev *dev,
   enum mlx4_port_type *port_type)
 {
@@ -791,6 +841,7 @@ static int mlx4_init_hca(struct mlx4_dev *dev)
goto err_stop_fw;
}
 
+   process_mod_param_profile();
profile = default_profile;
 
icm_size = mlx4_make_profile(dev, profile, dev_cap, init_hca);
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html