Re: [PATCH V3] IB/mlx4: Use vmalloc for WR buffers when needed

2015-12-16 Thread Wengang Wang

Hi Matt,

Yes, you are right.
Since the patch is already merged in, I am going to make a separate
patch for that.


thanks,
wengang

在 2015年12月12日 04:28, Matthew Finlay 写道:

Hi Wengang,

I was going through your patch set here, and it seems that you missed changing 
kfree to kvfree in mlx4_ib_destroy_srq().  In the current code if the srq wrid 
is allocated using vmalloc, then on cleanup we will use kfree, which is a bug.

Thanks,
-matt




On 10/7/15, 10:27 PM, "linux-rdma-ow...@vger.kernel.org on behalf of Wengang Wang" 
<linux-rdma-ow...@vger.kernel.org on behalf of wen.gang.w...@oracle.com> wrote:


There have been several cases where WR buffer allocation (kmalloc) failed.
It failed when allocating order-3 and/or order-4 contiguous pages, even though
there was actually 100MB+ of free memory at the time, but badly fragmented.
So try vmalloc when kmalloc fails.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
Acked-by: Or Gerlitz <ogerl...@mellanox.com>
---
drivers/infiniband/hw/mlx4/qp.c  | 19 +--
drivers/infiniband/hw/mlx4/srq.c | 11 ---
2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 4ad9be3..3ccbd3a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -34,6 +34,7 @@
#include 
#include 
#include 
+#include 

#include 
#include 
@@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
if (err)
goto err_mtt;

-   qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
-   qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+   qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->sq.wrid)
+   qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
+   qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->rq.wrid)
+   qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
goto err_wrid;
@@ -874,8 +881,8 @@ err_wrid:
if (qp_has_rq(init_attr))
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), 
>db);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   kvfree(qp->sq.wrid);
+   kvfree(qp->rq.wrid);
}

err_mtt:
@@ -1050,8 +1057,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
  >db);
ib_umem_release(qp->umem);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   kvfree(qp->sq.wrid);
+   kvfree(qp->rq.wrid);
if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
free_proxy_bufs(>ib_dev, qp);
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index dce5dfe..8d133c4 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -34,6 +34,7 @@
#include 
#include 
#include 
+#include 

#include "mlx4_ib.h"
#include "user.h"
@@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,

srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
if (!srq->wrid) {
-   err = -ENOMEM;
-   goto err_mtt;
+   srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64),
+ GFP_KERNEL, PAGE_KERNEL);
+   if (!srq->wrid) {
+   err = -ENOMEM;
+   goto err_mtt;
+   }
}
}

@@ -204,7 +209,7 @@ err_wrid:
if (pd->uobject)
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), 
>db);
else
-   kfree(srq->wrid);
+   kvfree(srq->wrid);

err_mtt:
mlx4_mtt_cleanup(dev->dev, >mtt);
--
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[garbled, mis-encoded copy of the mailing-list footer omitted]


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/mlx4: Replace kfree with kvfree in mlx4_ib_destroy_srq

2015-12-16 Thread Wengang Wang
Commit 0ef2f05c7e02ff99c0b5b583d7dee2cd12b053f2 uses vmalloc for WR buffers
when needed and uses kvfree to free the buffers. It missed changing kfree
to kvfree in mlx4_ib_destroy_srq().

Reported-by: Matthew Finlay <m...@mellanox.com>
Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 drivers/infiniband/hw/mlx4/srq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 8d133c4..c394376 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -286,7 +286,7 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq)
mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), 
>db);
ib_umem_release(msrq->umem);
} else {
-   kfree(msrq->wrid);
+   kvfree(msrq->wrid);
mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
  >buf);
mlx4_db_free(dev->dev, >db);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3] IB/mlx4: Use vmalloc for WR buffers when needed

2015-10-07 Thread Wengang Wang
There have been several cases where WR buffer allocation (kmalloc) failed.
It failed when allocating order-3 and/or order-4 contiguous pages, even though
there was actually 100MB+ of free memory at the time, but badly fragmented.
So try vmalloc when kmalloc fails.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
Acked-by: Or Gerlitz <ogerl...@mellanox.com>
---
 drivers/infiniband/hw/mlx4/qp.c  | 19 +--
 drivers/infiniband/hw/mlx4/srq.c | 11 ---
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 4ad9be3..3ccbd3a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
if (err)
goto err_mtt;
 
-   qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
-   qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+   qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->sq.wrid)
+   qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
+   qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->rq.wrid)
+   qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
goto err_wrid;
@@ -874,8 +881,8 @@ err_wrid:
if (qp_has_rq(init_attr))

mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   kvfree(qp->sq.wrid);
+   kvfree(qp->rq.wrid);
}
 
 err_mtt:
@@ -1050,8 +1057,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
  >db);
ib_umem_release(qp->umem);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   kvfree(qp->sq.wrid);
+   kvfree(qp->rq.wrid);
if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
free_proxy_bufs(>ib_dev, qp);
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index dce5dfe..8d133c4 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 
srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
if (!srq->wrid) {
-   err = -ENOMEM;
-   goto err_mtt;
+   srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64),
+ GFP_KERNEL, PAGE_KERNEL);
+   if (!srq->wrid) {
+   err = -ENOMEM;
+   goto err_mtt;
+   }
}
}
 
@@ -204,7 +209,7 @@ err_wrid:
if (pd->uobject)
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), 
>db);
else
-   kfree(srq->wrid);
+   kvfree(srq->wrid);
 
 err_mtt:
mlx4_mtt_cleanup(dev->dev, >mtt);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IPoIB: serialize changing on tx_outstanding

2015-10-07 Thread Wengang Wang

Hi Leon,

thanks for review.

在 2015年10月08日 12:33, Leon Romanovsky 写道:

On Mon, Sep 28, 2015 at 01:42:10PM +0800, Wengang Wang wrote:

Changes to tx_outstanding should be protected by a spinlock or performed as
atomic operations.

Such log is found in dmesg:

Sep 16 14:20:53 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034733, 
tx_tail 1034733, tx_outstanding 359 ipoib_sendq_size: 512
Sep 16 14:21:33 naep11x06 kernel: ib0: transmit timeout: latency 9560 msecs
Sep 16 14:21:33 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, 
tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512
Sep 16 14:21:38 naep11x06 kernel: ib0: transmit timeout: latency 14568 msecs
Sep 16 14:21:38 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, 
tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512

And the send queue of ib0 stayed full. When the transmit timeout is reported,
the queue is reported as "stopped", but the IPoIB tx_head and tx_tail
point to the same value. I am not able to see the corresponding numbers in
ipoib_cm_tx (for CM) because I have no vmcore. Though I am not quite sure it is
caused by parallel access to tx_outstanding (send path vs. interrupt path), we
really need to serialize the changes to tx_outstanding.

This patch also makes sure tx_outstanding is increased before post_send is
called, to avoid a possible decrease before the increase in case the
increment runs later than the interrupt handler.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
  drivers/infiniband/ulp/ipoib/ipoib_cm.c | 40 +++--
  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 24 ++--
  2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index c78dc16..044da94 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_tx_buf *tx_req;
int rc;
+   unsigned long flags;
  
  	if (unlikely(skb->len > tx->mtu)) {

ipoib_warn(priv, "packet len %d (> %d) too long to send, 
dropping\n",
@@ -742,27 +743,36 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
skb_orphan(skb);
skb_dst_drop(skb);
  
+	spin_lock_irqsave(>lock, flags);

+   if (++priv->tx_outstanding == ipoib_sendq_size) {
+   ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
+ tx->qp->qp_num);
+   netif_stop_queue(dev);
+   }
+   spin_unlock_irqrestore(>lock, flags);
+   if (netif_queue_stopped(dev)) {
+   rc = ib_req_notify_cq(priv->send_cq,
+   IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+   if (rc < 0)
+   ipoib_warn(priv, "request notify on send CQ failed\n");
+   else if (rc)
+   ipoib_send_comp_handler(priv->send_cq, dev);
+   }
+
rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
if (unlikely(rc)) {
ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors;
+   spin_lock_irqsave(>lock, flags);
+   --priv->tx_outstanding;
+   if (netif_queue_stopped(dev))
+   netif_wake_queue(dev);
+   spin_unlock_irqrestore(>lock, flags);

Why are you locking the netif_* calls?


Yes, I intended to do that. This makes the access to tx_outstanding
and the reopening of the send queue happen in the same atomic section, which is
the expected behavior.

Otherwise,  we may have the following problem:
#time order

thread1(on cpu1)                 thread2(on cpu2)
lock
modify/check tx_outstanding
unlock

                                 lock
                                 modify/check tx_outstanding
                                 unlock

                                 reopen queue

stop queue

So we actually want to reopen the send queue, but the result is that we
stopped it.


thanks,
wengang


ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb);
} else {
dev->trans_start = jiffies;
++tx->tx_head;
-
-   if (++priv->tx_outstanding == ipoib_sendq_size) {
-   ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
- tx->qp->qp_num);
-   netif_stop_queue(dev);
-   rc = ib_req_notify_cq(priv->send_cq,
-   IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
-   if (rc < 0)
-   ipoib_warn(priv, "req

Re: [PATCH] IPoIB: serialize changing on tx_outstanding

2015-10-07 Thread Wengang Wang

Hi,
Any comment on this patch?

thanks,
wengang

在 2015年09月28日 13:42, Wengang Wang 写道:

Changes to tx_outstanding should be protected by a spinlock or performed as
atomic operations.

Such log is found in dmesg:

Sep 16 14:20:53 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034733, 
tx_tail 1034733, tx_outstanding 359 ipoib_sendq_size: 512
Sep 16 14:21:33 naep11x06 kernel: ib0: transmit timeout: latency 9560 msecs
Sep 16 14:21:33 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, 
tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512
Sep 16 14:21:38 naep11x06 kernel: ib0: transmit timeout: latency 14568 msecs
Sep 16 14:21:38 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, 
tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512

And the send queue of ib0 stayed full. When the transmit timeout is reported,
the queue is reported as "stopped", but the IPoIB tx_head and tx_tail
point to the same value. I am not able to see the corresponding numbers in
ipoib_cm_tx (for CM) because I have no vmcore. Though I am not quite sure it is
caused by parallel access to tx_outstanding (send path vs. interrupt path), we
really need to serialize the changes to tx_outstanding.

This patch also makes sure tx_outstanding is increased before post_send is
called, to avoid a possible decrease before the increase in case the
increment runs later than the interrupt handler.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
  drivers/infiniband/ulp/ipoib/ipoib_cm.c | 40 +++--
  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 24 ++--
  2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index c78dc16..044da94 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_tx_buf *tx_req;
int rc;
+   unsigned long flags;
  
  	if (unlikely(skb->len > tx->mtu)) {

ipoib_warn(priv, "packet len %d (> %d) too long to send, 
dropping\n",
@@ -742,27 +743,36 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
skb_orphan(skb);
skb_dst_drop(skb);
  
+	spin_lock_irqsave(>lock, flags);

+   if (++priv->tx_outstanding == ipoib_sendq_size) {
+   ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
+ tx->qp->qp_num);
+   netif_stop_queue(dev);
+   }
+   spin_unlock_irqrestore(>lock, flags);
+   if (netif_queue_stopped(dev)) {
+   rc = ib_req_notify_cq(priv->send_cq,
+   IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+   if (rc < 0)
+   ipoib_warn(priv, "request notify on send CQ failed\n");
+   else if (rc)
+   ipoib_send_comp_handler(priv->send_cq, dev);
+   }
+
rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
if (unlikely(rc)) {
ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors;
+   spin_lock_irqsave(>lock, flags);
+   --priv->tx_outstanding;
+   if (netif_queue_stopped(dev))
+   netif_wake_queue(dev);
+   spin_unlock_irqrestore(>lock, flags);
ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb);
} else {
dev->trans_start = jiffies;
++tx->tx_head;
-
-   if (++priv->tx_outstanding == ipoib_sendq_size) {
-   ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
- tx->qp->qp_num);
-   netif_stop_queue(dev);
-   rc = ib_req_notify_cq(priv->send_cq,
-   IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
-   if (rc < 0)
-   ipoib_warn(priv, "request notify on send CQ 
failed\n");
-   else if (rc)
-   ipoib_send_comp_handler(priv->send_cq, dev);
-   }
}
  }
  
@@ -796,10 +806,13 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)

netif_tx_lock(dev);
  
  	++tx->tx_tail;

+
+   spin_lock_irqsave(>lock, flags);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, >flags))
netif_wake_queue(dev);
+  

Re: [PATCH] IB/mlx4: correct order of variables in log

2015-10-07 Thread Wengang Wang

Hi,

Any comment on this patch?

thanks,
wengang

在 2015年09月28日 10:08, Wengang Wang 写道:

There is a mis-order in mlx4 log. Fix it.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
  drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 0a32020..150fbb3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
  smp->method == IB_MGMT_METHOD_GET) || network_view) {
mlx4_err(dev, "Unprivileged slave %d is trying to execute a 
Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n",
-slave, smp->method, smp->mgmt_class,
+slave, smp->mgmt_class, smp->method,
 network_view ? "Network" : "Host",
 be16_to_cpu(smp->attr_id));
return -EPERM;


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: correct order of variables in log

2015-10-07 Thread Wengang Wang

Thanks Or.
I will resend the patch with the revised title and your Ack.

thanks,
wengang

在 2015年10月08日 12:52, Or Gerlitz 写道:

On 9/28/2015 5:08 AM, Wengang Wang wrote:

There is a mis-order in mlx4 log. Fix it.

Signed-off-by: Wengang Wang<wen.gang.w...@oracle.com>

I wanted to ack it, but wait...

We want commits to our driver to start with Capital letter so please
resubmit with this  title

IB/mlx4: Use correct order of variables in log message

You can add Acked-by: Or Gerlitz <ogerl...@mellanox.com>


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2] IB/mlx4: Use correct order of variables in log message

2015-10-07 Thread Wengang Wang
There is a mis-order in mlx4 log. Fix it.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
Acked-by: Or Gerlitz <ogerl...@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 0a32020..150fbb3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
  smp->method == IB_MGMT_METHOD_GET) || network_view) {
mlx4_err(dev, "Unprivileged slave %d is trying to 
execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. 
Rejecting\n",
-slave, smp->method, smp->mgmt_class,
+slave, smp->mgmt_class, smp->method,
 network_view ? "Network" : "Host",
 be16_to_cpu(smp->attr_id));
return -EPERM;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IPoIB: serialize changing on tx_outstanding

2015-09-27 Thread Wengang Wang
Changes to tx_outstanding should be protected by a spinlock or performed as
atomic operations.

Such log is found in dmesg:

Sep 16 14:20:53 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034733, 
tx_tail 1034733, tx_outstanding 359 ipoib_sendq_size: 512
Sep 16 14:21:33 naep11x06 kernel: ib0: transmit timeout: latency 9560 msecs
Sep 16 14:21:33 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, 
tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512
Sep 16 14:21:38 naep11x06 kernel: ib0: transmit timeout: latency 14568 msecs
Sep 16 14:21:38 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, 
tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512

And the send queue of ib0 stayed full. When the transmit timeout is reported,
the queue is reported as "stopped", but the IPoIB tx_head and tx_tail
point to the same value. I am not able to see the corresponding numbers in
ipoib_cm_tx (for CM) because I have no vmcore. Though I am not quite sure it is
caused by parallel access to tx_outstanding (send path vs. interrupt path), we
really need to serialize the changes to tx_outstanding.

This patch also makes sure tx_outstanding is increased before post_send is
called, to avoid a possible decrease before the increase in case the
increment runs later than the interrupt handler.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 drivers/infiniband/ulp/ipoib/ipoib_cm.c | 40 +++--
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 24 ++--
 2 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index c78dc16..044da94 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_tx_buf *tx_req;
int rc;
+   unsigned long flags;
 
if (unlikely(skb->len > tx->mtu)) {
ipoib_warn(priv, "packet len %d (> %d) too long to send, 
dropping\n",
@@ -742,27 +743,36 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff 
*skb, struct ipoib_cm_
skb_orphan(skb);
skb_dst_drop(skb);
 
+   spin_lock_irqsave(>lock, flags);
+   if (++priv->tx_outstanding == ipoib_sendq_size) {
+   ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
+ tx->qp->qp_num);
+   netif_stop_queue(dev);
+   }
+   spin_unlock_irqrestore(>lock, flags);
+   if (netif_queue_stopped(dev)) {
+   rc = ib_req_notify_cq(priv->send_cq,
+   IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+   if (rc < 0)
+   ipoib_warn(priv, "request notify on send CQ failed\n");
+   else if (rc)
+   ipoib_send_comp_handler(priv->send_cq, dev);
+   }
+
rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
if (unlikely(rc)) {
ipoib_warn(priv, "post_send failed, error %d\n", rc);
++dev->stats.tx_errors;
+   spin_lock_irqsave(>lock, flags);
+   --priv->tx_outstanding;
+   if (netif_queue_stopped(dev))
+   netif_wake_queue(dev);
+   spin_unlock_irqrestore(>lock, flags);
ipoib_dma_unmap_tx(priv, tx_req);
dev_kfree_skb_any(skb);
} else {
dev->trans_start = jiffies;
++tx->tx_head;
-
-   if (++priv->tx_outstanding == ipoib_sendq_size) {
-   ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net 
queue\n",
- tx->qp->qp_num);
-   netif_stop_queue(dev);
-   rc = ib_req_notify_cq(priv->send_cq,
-   IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
-   if (rc < 0)
-   ipoib_warn(priv, "request notify on send CQ 
failed\n");
-   else if (rc)
-   ipoib_send_comp_handler(priv->send_cq, dev);
-   }
}
 }
 
@@ -796,10 +806,13 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct 
ib_wc *wc)
netif_tx_lock(dev);
 
++tx->tx_tail;
+
+   spin_lock_irqsave(>lock, flags);
if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
netif_queue_stopped(dev) &&
test_bit(IPOIB_FLAG_ADMIN_UP, >flags))
netif_wake_queue(dev);
+   spin_unlock_irqrestore(>lock, flags);
 
if (wc->status != IB_

[PATCH] IB/mlx4: correct order of variables in log

2015-09-27 Thread Wengang Wang
There is a mis-order in mlx4 log. Fix it.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 0a32020..150fbb3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int 
slave,
if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED &&
  smp->method == IB_MGMT_METHOD_GET) || network_view) {
mlx4_err(dev, "Unprivileged slave %d is trying to 
execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. 
Rejecting\n",
-slave, smp->method, smp->mgmt_class,
+slave, smp->mgmt_class, smp->method,
 network_view ? "Network" : "Host",
 be16_to_cpu(smp->attr_id));
return -EPERM;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/mlx4: Use vmalloc for WR buffers when needed

2015-09-24 Thread Wengang Wang
There have been several cases where WR buffer allocation (kmalloc) failed.
It failed when allocating order-3 and/or order-4 contiguous pages, even though
there was actually 100MB+ of free memory at the time, but badly fragmented.
So try vmalloc when kmalloc fails.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 drivers/infiniband/hw/mlx4/qp.c  | 20 ++--
 drivers/infiniband/hw/mlx4/srq.c | 11 ---
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 4ad9be3..f152d8a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
if (err)
goto err_mtt;
 
-   qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
-   qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+   qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->sq.wrid)
+   qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
+   qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->rq.wrid)
+   qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
goto err_wrid;
@@ -874,8 +881,8 @@ err_wrid:
if (qp_has_rq(init_attr))

mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   kvfree(qp->sq.wrid);
+   kvfree(qp->rq.wrid);
}
 
 err_mtt:
@@ -1050,8 +1057,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
  >db);
ib_umem_release(qp->umem);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   kvfree(qp->sq.wrid);
+   kvfree(qp->rq.wrid);
+
if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
free_proxy_bufs(>ib_dev, qp);
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index dce5dfe..8d133c4 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 
srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
if (!srq->wrid) {
-   err = -ENOMEM;
-   goto err_mtt;
+   srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64),
+ GFP_KERNEL, PAGE_KERNEL);
+   if (!srq->wrid) {
+   err = -ENOMEM;
+   goto err_mtt;
+   }
}
}
 
@@ -204,7 +209,7 @@ err_wrid:
if (pd->uobject)
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), 
>db);
else
-   kfree(srq->wrid);
+   kvfree(srq->wrid);
 
 err_mtt:
mlx4_mtt_cleanup(dev->dev, >mtt);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Use vmalloc for WR buffers when needed

2015-09-24 Thread Wengang Wang


在 2015年09月24日 16:07, Or Gerlitz 写道:

On 9/24/2015 9:52 AM, Wengang Wang wrote:

  } else {
-kfree(qp->sq.wrid);
-kfree(qp->rq.wrid);
+if (is_vmalloc_addr(qp->sq.wrid))
+vfree(qp->sq.wrid);
+else
+kfree(qp->sq.wrid);
+
+if (is_vmalloc_addr(qp->rq.wrid))
+vfree(qp->rq.wrid);
+else
+kfree(qp->rq.wrid);
  }


NO

just call kvfree, see commit 914efb0 "mlx4: don't duplicate kvfree()"


Yeap, will re-post.

thanks,
wengang

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] mlx4: vmalloc for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid

2015-09-24 Thread Wengang Wang

Hi Or,

在 2015年09月24日 13:33, Or Gerlitz 写道:

On 9/24/2015 5:10 AM, Wengang Wang wrote:
Use __vmalloc to allocate memory for mlx4_ib_wq.wrid and 
mlx4_ib_srq.wrid.


There have been several cases where the kmalloc for wrid failed with a call
stack like the following:


Using vmalloc and friends should be done with care; specifically, we'd like
to go there only when needed (namely when kmalloc fails), else we can get into
another set of troubles.

Please use the practice introduced in commit 89dd86d "mlx4_core: Allow 
large mlx4_buddy bitmaps"
to go the vmalloc way only when needed, note you can just call kvfree 
later, no need to branch
when freeing things (as was fixed later in commit 914efb0 "mlx4: don't 
duplicate kvfree()")


And have the patch title to be

IB/mlx4: Use vmalloc  for WR buffers when needed

I don't think the OOM oops in the change log helps, skip it.



OK, will look at commit 89dd86d and try again.

Thanks for such a quick response.

thanks
wengang

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB/mlx4: Use vmalloc for WR buffers when needed

2015-09-24 Thread Wengang Wang
There have been several cases where WR buffer allocation (kmalloc) failed.
It failed when allocating order-3 and/or order-4 contiguous pages, even though
there was actually 100MB+ of free memory at the time, but badly fragmented.
So try vmalloc when kmalloc fails.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 drivers/infiniband/hw/mlx4/qp.c  | 34 --
 drivers/infiniband/hw/mlx4/srq.c | 14 +++---
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 4ad9be3..da551e1 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
if (err)
goto err_mtt;
 
-   qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
-   qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+   qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->sq.wrid)
+   qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
+   qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->rq.wrid)
+   qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);
if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
goto err_wrid;
@@ -874,8 +881,15 @@ err_wrid:
if (qp_has_rq(init_attr))

mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   if (is_vmalloc_addr(qp->sq.wrid))
+   vfree(qp->sq.wrid);
+   else
+   kfree(qp->sq.wrid);
+
+   if (is_vmalloc_addr(qp->rq.wrid))
+   vfree(qp->rq.wrid);
+   else
+   kfree(qp->rq.wrid);
}
 
 err_mtt:
@@ -1050,8 +1064,16 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
  >db);
ib_umem_release(qp->umem);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   if (is_vmalloc_addr(qp->sq.wrid))
+   vfree(qp->sq.wrid);
+   else
+   kfree(qp->sq.wrid);
+
+   if (is_vmalloc_addr(qp->rq.wrid))
+   vfree(qp->rq.wrid);
+   else
+   kfree(qp->rq.wrid);
+
if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
free_proxy_bufs(>ib_dev, qp);
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index dce5dfe..d0e8d40 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 
srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
if (!srq->wrid) {
-   err = -ENOMEM;
-   goto err_mtt;
+   srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64),
+ GFP_KERNEL, PAGE_KERNEL);
+   if (!srq->wrid) {
+   err = -ENOMEM;
+   goto err_mtt;
+   }
}
}
 
@@ -204,7 +209,10 @@ err_wrid:
if (pd->uobject)
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), 
>db);
else
-   kfree(srq->wrid);
+   if (is_vmalloc_addr(srq->wrid))
+   vfree(srq->wrid);
+   else
+   kfree(srq->wrid);
 
 err_mtt:
mlx4_mtt_cleanup(dev->dev, >mtt);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Use vmalloc for WR buffers when needed

2015-09-24 Thread Wengang Wang

Hi Or,

在 2015年09月24日 19:57, Or Gerlitz 写道:

On Thu, Sep 24, 2015 at 1:49 PM, Wengang Wang <wen.gang.w...@oracle.com> wrote:

@@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
 if (err)
 goto err_mtt;

-   qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
-   qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+   qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp);
+   if (!qp->sq.wrid)
+   qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64),
+   gfp, PAGE_KERNEL);

On other spots of mlx4, we're using vmalloc and not __vmalloc, any
pros/cons for going that way too here?


vmalloc just uses GFP_KERNEL | __GFP_HIGHMEM, so we can't pass the gfp 
flag in with it.  We should respect the original code, which needs to pass 
the flag.


thanks,
wengang

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] mlx4: vmalloc for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid

2015-09-23 Thread Wengang Wang
Use __vmalloc to allocate memory for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid.

The kmalloc for wrid has failed several times with a call trace like the
following:

kworker/u:4: page allocation failure: order:4, mode:0x2000d0
Pid: 16388, comm: kworker/u:4 Not tainted
Call Trace:
 [] warn_alloc_failed+0xf3/0x160
 [] ? __alloc_pages_direct_compact+0x1fa/0x200
 [] __alloc_pages_slowpath+0x4a6/0x7b0
 [] __alloc_pages_nodemask+0x2fb/0x320
 [] kmem_getpages+0x67/0x1c0
 [] fallback_alloc+0x187/0x250
 [] cache_alloc_node+0x9a/0x150
 [] __kmalloc+0x18b/0x340
 [] ? create_qp_common+0x431/0x8e0 [mlx4_ib]
 [] create_qp_common+0x431/0x8e0 [mlx4_ib]
 [] ? kzalloc.clone.1+0xe/0x10 [mlx4_ib]
 [] mlx4_ib_create_qp+0x207/0x310 [mlx4_ib]
 [] ib_create_qp+0x41/0x1c0 [ib_core]
 [] ipoib_cm_create_tx_qp+0xc8/0x130 [ib_ipoib]
 [] ? __vmalloc_node+0x35/0x40
 [] ipoib_cm_tx_init+0x65/0x380 [ib_ipoib]
 [] ? sched_clock_cpu+0xcd/0x110
 [] ? xen_mc_flush+0xb0/0x1b0
 [] ipoib_cm_tx_start+0x230/0x3d0 [ib_ipoib]
 [] process_one_work+0x180/0x420
 [] worker_thread+0x12e/0x390
 [] ? manage_workers+0x180/0x180
 [] kthread+0xce/0xe0
 [] ? xen_end_context_switch+0x1e/0x30
 [] ? kthread_freezable_should_stop+0x70/0x70
 [] ret_from_fork+0x7c/0xb0
 [] ? kthread_freezable_should_stop+0x70/0x70

It needs 16 contiguous pages and the allocation failed. At the time there was
actually 100MB+ of free memory:

Node 0 Normal: 10268*4kB (UM) 7443*8kB (UEM) 1647*16kB (UM) 35*32kB (UR)
1*64kB (R) 4*128kB (R) 1*256kB (R) 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB =
129944kB

I also hit the same errors at order 3.

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 drivers/infiniband/hw/mlx4/qp.c  | 15 +--
 drivers/infiniband/hw/mlx4/srq.c |  6 --
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 4ad9be3..754ceb9 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -786,8 +787,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, 
struct ib_pd *pd,
if (err)
goto err_mtt;
 
-   qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
-   qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+   qp->sq.wrid  = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp,
+PAGE_KERNEL);
+   qp->rq.wrid  = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp,
+PAGE_KERNEL);
if (!qp->sq.wrid || !qp->rq.wrid) {
err = -ENOMEM;
goto err_wrid;
@@ -874,8 +877,8 @@ err_wrid:
if (qp_has_rq(init_attr))

mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   vfree(qp->sq.wrid);
+   vfree(qp->rq.wrid);
}
 
 err_mtt:
@@ -1050,8 +1053,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, 
struct mlx4_ib_qp *qp,
  >db);
ib_umem_release(qp->umem);
} else {
-   kfree(qp->sq.wrid);
-   kfree(qp->rq.wrid);
+   vfree(qp->sq.wrid);
+   vfree(qp->rq.wrid);
if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
free_proxy_bufs(>ib_dev, qp);
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index dce5dfe..6d21bb2 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "mlx4_ib.h"
 #include "user.h"
@@ -170,7 +171,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
if (err)
goto err_mtt;
 
-   srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+   srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL,
+ PAGE_KERNEL);
if (!srq->wrid) {
err = -ENOMEM;
goto err_mtt;
@@ -204,7 +206,7 @@ err_wrid:
if (pd->uobject)
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), 
>db);
else
-   kfree(srq->wrid);
+   vfree(srq->wrid);
 
 err_mtt:
mlx4_mtt_cleanup(dev->dev, >mtt);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] rds: rds_ib_device.refcount overflow

2015-07-29 Thread Wengang Wang

Doug,

No problem. I found the patch picked up.

thanks,
wengang

在 2015年07月29日 22:36, Doug Ledford 写道:

On 07/12/2015 09:18 PM, Wengang Wang wrote:

Hi Doug,

What do you think about this patch?

Sorry, I picked this up already.  I must have missed sending out the
acknowledgment on this one.


thanks,
wengang

在 2015年07月06日 14:35, Wengang Wang 写道:

Fixes: 3e0249f9c05c (RDS/IB: add refcount tracking to struct
rds_ib_device)

The reference on rds_ib_device.refcount is not dropped when rds_ib_alloc_fmr
fails (mr pool running out). This leads to a refcount overflow.

A complaint at line 117 (see below) is seen. From the vmcore:
s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is
-2147475448.
That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is
very likely to return ERR_PTR(-EAGAIN).

115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
116 {
117         BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
118         if (atomic_dec_and_test(&rds_ibdev->refcount))
119                 queue_work(rds_wq, &rds_ibdev->free_work);
120 }

fix is to drop refcount when rds_ib_alloc_fmr failed.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
Reviewed-by: Haggai Eran hagg...@mellanox.com
---
   net/rds/ib_rdma.c | 4 +++-
   1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 273b8bf..657ba9f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg,
unsigned long nents,
   }
 ibmr = rds_ib_alloc_fmr(rds_ibdev);
-if (IS_ERR(ibmr))
+if (IS_ERR(ibmr)) {
+rds_ib_dev_put(rds_ibdev);
   return ibmr;
+}
 ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
   if (ret == 0)

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html




--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] rds: rds_ib_device.refcount overflow

2015-07-12 Thread Wengang Wang

Hi Doug,

What do you think about this patch?

thanks,
wengang

在 2015年07月06日 14:35, Wengang Wang 写道:

Fixes: 3e0249f9c05c (RDS/IB: add refcount tracking to struct rds_ib_device)

The reference on rds_ib_device.refcount is not dropped when rds_ib_alloc_fmr
fails (mr pool running out). This leads to a refcount overflow.

A complaint at line 117 (see below) is seen. From the vmcore:
s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448.
That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely
to return ERR_PTR(-EAGAIN).

115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
116 {
117         BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
118         if (atomic_dec_and_test(&rds_ibdev->refcount))
119                 queue_work(rds_wq, &rds_ibdev->free_work);
120 }

fix is to drop refcount when rds_ib_alloc_fmr failed.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
Reviewed-by: Haggai Eran hagg...@mellanox.com
---
  net/rds/ib_rdma.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 273b8bf..657ba9f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long 
nents,
}
  
  	ibmr = rds_ib_alloc_fmr(rds_ibdev);

-   if (IS_ERR(ibmr))
+   if (IS_ERR(ibmr)) {
+   rds_ib_dev_put(rds_ibdev);
return ibmr;
+   }
  
  	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);

if (ret == 0)


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] rds: rds_ib_device.refcount overflow

2015-07-06 Thread Wengang Wang

Haggai,

Thanks for review! I will add the message you suggested and re-post.

thanks,
wengang

在 2015年07月06日 14:18, Haggai Eran 写道:

On 24/06/2015 07:54, Wengang Wang wrote:

The reference on rds_ib_device.refcount is not dropped when rds_ib_alloc_fmr
fails (mr pool running out). This leads to a refcount overflow.

A complaint at line 117 (see below) is seen. From the vmcore:
s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448.
That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely
to return ERR_PTR(-EAGAIN).

115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
116 {
117         BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
118         if (atomic_dec_and_test(&rds_ibdev->refcount))
119                 queue_work(rds_wq, &rds_ibdev->free_work);
120 }

fix is to drop refcount when rds_ib_alloc_fmr failed.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
---
  net/rds/ib_rdma.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 273b8bf..657ba9f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long 
nents,
}
  
  	ibmr = rds_ib_alloc_fmr(rds_ibdev);

-   if (IS_ERR(ibmr))
+   if (IS_ERR(ibmr)) {
+   rds_ib_dev_put(rds_ibdev);
return ibmr;
+   }
  
  	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);

if (ret == 0)


It seems like the function indeed is missing a put on the rds_ibdev in
that case.

Reviewed-by: Haggai Eran hagg...@mellanox.com

You may also want to add:
Fixes: 3e0249f9c05c (RDS/IB: add refcount tracking to struct
rds_ib_device)


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] rds: rds_ib_device.refcount overflow

2015-07-06 Thread Wengang Wang
Fixes: 3e0249f9c05c (RDS/IB: add refcount tracking to struct rds_ib_device)

The reference on rds_ib_device.refcount is not dropped when rds_ib_alloc_fmr
fails (mr pool running out). This leads to a refcount overflow.

A complaint at line 117 (see below) is seen. From the vmcore:
s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448.
That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely
to return ERR_PTR(-EAGAIN).

115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
116 {
117         BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
118         if (atomic_dec_and_test(&rds_ibdev->refcount))
119                 queue_work(rds_wq, &rds_ibdev->free_work);
120 }

fix is to drop refcount when rds_ib_alloc_fmr failed.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
Reviewed-by: Haggai Eran hagg...@mellanox.com
---
 net/rds/ib_rdma.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 273b8bf..657ba9f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long 
nents,
}
 
ibmr = rds_ib_alloc_fmr(rds_ibdev);
-   if (IS_ERR(ibmr))
+   if (IS_ERR(ibmr)) {
+   rds_ib_dev_put(rds_ibdev);
return ibmr;
+   }
 
ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
if (ret == 0)
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] rds: rds_ib_device.refcount overflow

2015-07-05 Thread Wengang Wang

Hi Doug,

Could you please review this patch?

thanks,
wengang

在 2015年06月24日 12:54, Wengang Wang 写道:

The reference on rds_ib_device.refcount is not dropped when rds_ib_alloc_fmr
fails (mr pool running out). This leads to a refcount overflow.

A complaint at line 117 (see below) is seen. From the vmcore:
s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448.
That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely
to return ERR_PTR(-EAGAIN).

115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
116 {
117         BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0);
118         if (atomic_dec_and_test(&rds_ibdev->refcount))
119                 queue_work(rds_wq, &rds_ibdev->free_work);
120 }

fix is to drop refcount when rds_ib_alloc_fmr failed.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
---
  net/rds/ib_rdma.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 273b8bf..657ba9f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long 
nents,
}
  
  	ibmr = rds_ib_alloc_fmr(rds_ibdev);

-   if (IS_ERR(ibmr))
+   if (IS_ERR(ibmr)) {
+   rds_ib_dev_put(rds_ibdev);
return ibmr;
+   }
  
  	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);

if (ret == 0)


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] rds: re-entry of rds_ib_xmit/rds_iw_xmit

2015-05-24 Thread Wengang Wang

Hi,
Could anyone review this patch please.

thanks,
wengang
在 2015年05月21日 13:11, Wengang Wang 写道:

The BUG_ON at line 452/453 is triggered in function rds_send_xmit.

  441 while (ret) {
  442         tmp = min_t(int, ret, sg->length -
  443                               conn->c_xmit_data_off);
  444         conn->c_xmit_data_off += tmp;
  445         ret -= tmp;
  446         if (conn->c_xmit_data_off == sg->length) {
  447                 conn->c_xmit_data_off = 0;
  448                 sg++;
  449                 conn->c_xmit_sg++;
  450                 if (ret != 0 && conn->c_xmit_sg == rm->data.op_nents)
  451                         printk(KERN_ERR "conn %p rm %p sg %p ret %d\n", conn, rm, sg, ret);
  452                 BUG_ON(ret != 0 &&
  453                        conn->c_xmit_sg == rm->data.op_nents);
  454         }
  455 }

It is complaining that the total sent length is bigger than what we want to send.

rds_ib_xmit() is wrong for the second entry for the same rds_message returning
wrong value.

the sg and off passed by rds_send_xmit to rds_ib_xmit is based on
scatterlist.offset/length, but the rds_ib_xmit action is based on
scatterlist.dma_address/dma_length. in case dma_length is larger than length
there is problem. for the 2nd and later entries of rds_ib_xmit for same
rds_message, at least one of the following two is wrong:

1) the scatterlist to start with: the chosen one can be far beyond the correct
one.
2) the offset to start with within the scatterlist.

fix:
add op_dmasg and op_dmaoff to the rm_data_op structure, indicating respectively
the scatterlist and the offset within it to start with for rds_ib_xmit. op_dmasg
and op_dmaoff are initialized to zero when doing dma mapping the first time the
message is seen, and are changed when filling send slots.

the same applies to rds_iw_xmit too.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
---
  net/rds/ib_send.c | 17 +++--
  net/rds/iw_send.c | 18 +++---
  net/rds/rds.h |  2 ++
  3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index bd3825d..1df6c84 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -605,6 +605,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
}
  
  		rds_message_addref(rm);

+   rm-data.op_dmasg = 0;
+   rm-data.op_dmaoff = 0;
ic-i_data_op = rm-data;
  
  		/* Finalize the header */

@@ -658,7 +660,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
send = ic-i_sends[pos];
first = send;
prev = NULL;
-   scat = ic-i_data_op-op_sg[sg];
+   scat = ic-i_data_op-op_sg[rm-data.op_dmasg];
i = 0;
do {
unsigned int len = 0;
@@ -680,17 +682,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
/* Set up the data, if present */
if (i  work_alloc
 scat != rm-data.op_sg[rm-data.op_count]) {
-   len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - 
off);
+   len = min(RDS_FRAG_SIZE,
+   ib_sg_dma_len(dev, scat) - rm-data.op_dmaoff);
send-s_wr.num_sge = 2;
  
-			send-s_sge[1].addr = ib_sg_dma_address(dev, scat) + off;

+   send-s_sge[1].addr = ib_sg_dma_address(dev, scat);
+   send-s_sge[1].addr += rm-data.op_dmaoff;
send-s_sge[1].length = len;
  
  			bytes_sent += len;

-   off += len;
-   if (off == ib_sg_dma_len(dev, scat)) {
+   rm-data.op_dmaoff += len;
+   if (rm-data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
scat++;
-   off = 0;
+   rm-data.op_dmasg++;
+   rm-data.op_dmaoff = 0;
}
}
  
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c

index 1383478..334fe98 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -581,6 +581,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct 
rds_message *rm,
ic-i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
ic-i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
rds_message_addref(rm);
+   rm-data.op_dmasg = 0;
+   rm-data.op_dmaoff = 0;
ic-i_rm = rm;
  
  		/* Finalize the header */

@@ -622,7 +624,7 @@ int rds_iw_xmit(struct rds_connection *conn

[PATCH] rds: re-entry of rds_ib_xmit/rds_iw_xmit

2015-05-20 Thread Wengang Wang
The BUG_ON at line 452/453 is triggered in function rds_send_xmit.

 441 while (ret) {
 442         tmp = min_t(int, ret, sg->length -
 443                               conn->c_xmit_data_off);
 444         conn->c_xmit_data_off += tmp;
 445         ret -= tmp;
 446         if (conn->c_xmit_data_off == sg->length) {
 447                 conn->c_xmit_data_off = 0;
 448                 sg++;
 449                 conn->c_xmit_sg++;
 450                 if (ret != 0 && conn->c_xmit_sg == rm->data.op_nents)
 451                         printk(KERN_ERR "conn %p rm %p sg %p ret %d\n", conn, rm, sg, ret);
 452                 BUG_ON(ret != 0 &&
 453                        conn->c_xmit_sg == rm->data.op_nents);
 454         }
 455 }

It is complaining that the total sent length is bigger than what we want to send.

rds_ib_xmit() is wrong for the second entry for the same rds_message returning
wrong value.

the sg and off passed by rds_send_xmit to rds_ib_xmit is based on
scatterlist.offset/length, but the rds_ib_xmit action is based on
scatterlist.dma_address/dma_length. in case dma_length is larger than length
there is problem. for the 2nd and later entries of rds_ib_xmit for same
rds_message, at least one of the following two is wrong:

1) the scatterlist to start with: the chosen one can be far beyond the correct
   one.
2) the offset to start with within the scatterlist.

fix:
add op_dmasg and op_dmaoff to the rm_data_op structure, indicating respectively
the scatterlist and the offset within it to start with for rds_ib_xmit. op_dmasg
and op_dmaoff are initialized to zero when doing dma mapping the first time the
message is seen, and are changed when filling send slots.

the same applies to rds_iw_xmit too.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
---
 net/rds/ib_send.c | 17 +++--
 net/rds/iw_send.c | 18 +++---
 net/rds/rds.h |  2 ++
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index bd3825d..1df6c84 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -605,6 +605,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
}
 
rds_message_addref(rm);
+   rm-data.op_dmasg = 0;
+   rm-data.op_dmaoff = 0;
ic-i_data_op = rm-data;
 
/* Finalize the header */
@@ -658,7 +660,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
send = ic-i_sends[pos];
first = send;
prev = NULL;
-   scat = ic-i_data_op-op_sg[sg];
+   scat = ic-i_data_op-op_sg[rm-data.op_dmasg];
i = 0;
do {
unsigned int len = 0;
@@ -680,17 +682,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct 
rds_message *rm,
/* Set up the data, if present */
if (i  work_alloc
 scat != rm-data.op_sg[rm-data.op_count]) {
-   len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - 
off);
+   len = min(RDS_FRAG_SIZE,
+   ib_sg_dma_len(dev, scat) - rm-data.op_dmaoff);
send-s_wr.num_sge = 2;
 
-   send-s_sge[1].addr = ib_sg_dma_address(dev, scat) + 
off;
+   send-s_sge[1].addr = ib_sg_dma_address(dev, scat);
+   send-s_sge[1].addr += rm-data.op_dmaoff;
send-s_sge[1].length = len;
 
bytes_sent += len;
-   off += len;
-   if (off == ib_sg_dma_len(dev, scat)) {
+   rm-data.op_dmaoff += len;
+   if (rm-data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
scat++;
-   off = 0;
+   rm-data.op_dmasg++;
+   rm-data.op_dmaoff = 0;
}
}
 
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
index 1383478..334fe98 100644
--- a/net/rds/iw_send.c
+++ b/net/rds/iw_send.c
@@ -581,6 +581,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct 
rds_message *rm,
ic-i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
ic-i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
rds_message_addref(rm);
+   rm-data.op_dmasg = 0;
+   rm-data.op_dmaoff = 0;
ic-i_rm = rm;
 
/* Finalize the header */
@@ -622,7 +624,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct 
rds_message *rm,
send = ic-i_sends[pos

[PATCH] bonding: move ipoib_header_ops to vmlinux

2014-12-29 Thread Wengang Wang
When the last slave of a bonding master is removed, the bonding master no longer works.
If packet_snd is then called against the master net_device, it calls
header_ops->create, which still points to the slave's header_ops. In case the slave
is ipoib and the module is unloaded, header_ops would point to an invalid address.
Accessing it will cause problems.
This patch tries to fix this issue by moving ipoib_header_ops to vmlinux to keep
it valid even when the ipoib module is unloaded.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
---
 drivers/infiniband/ulp/ipoib/ipoib.h  | 10 -
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 28 +
 include/linux/ibdevice.h  | 15 ++
 include/linux/if_infiniband.h | 11 ++
 include/uapi/linux/if_infiniband.h| 16 ---
 net/Makefile  |  2 +-
 net/infiniband/Makefile   |  5 +
 net/infiniband/infiniband.c   | 34 +++
 8 files changed, 80 insertions(+), 41 deletions(-)
 create mode 100644 include/linux/ibdevice.h
 create mode 100644 include/linux/if_infiniband.h
 create mode 100644 net/infiniband/Makefile
 create mode 100644 net/infiniband/infiniband.c

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index d7562be..7c25670 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -121,16 +121,6 @@ enum {
 
 /* structs */
 
-struct ipoib_header {
-   __be16  proto;
-   u16 reserved;
-};
-
-struct ipoib_cb {
-   struct qdisc_skb_cb qdisc_cb;
-   u8  hwaddr[INFINIBAND_ALEN];
-};
-
 static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
 {
BUILD_BUG_ON(sizeof(skb-cb)  sizeof(struct ipoib_cb));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 58b5aa3..9233085 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -34,6 +34,7 @@
 
 #include ipoib.h
 
+#include linux/ibdevice.h
 #include linux/module.h
 
 #include linux/init.h
@@ -807,29 +808,6 @@ static void ipoib_timeout(struct net_device *dev)
/* XXX reset QP, etc. */
 }
 
-static int ipoib_hard_header(struct sk_buff *skb,
-struct net_device *dev,
-unsigned short type,
-const void *daddr, const void *saddr, unsigned len)
-{
-   struct ipoib_header *header;
-   struct ipoib_cb *cb = ipoib_skb_cb(skb);
-
-   header = (struct ipoib_header *) skb_push(skb, sizeof *header);
-
-   header-proto = htons(type);
-   header-reserved = 0;
-
-   /*
-* we don't rely on dst_entry structure,  always stuff the
-* destination address into skb-cb so we can figure out where
-* to send the packet later.
-*/
-   memcpy(cb-hwaddr, daddr, INFINIBAND_ALEN);
-
-   return sizeof *header;
-}
-
 static void ipoib_set_mcast_list(struct net_device *dev)
 {
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1328,10 +1306,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
ipoib_neigh_hash_uninit(dev);
 }
 
-static const struct header_ops ipoib_header_ops = {
-   .create = ipoib_hard_header,
-};
-
 static const struct net_device_ops ipoib_netdev_ops = {
.ndo_uninit  = ipoib_uninit,
.ndo_open= ipoib_open,
diff --git a/include/linux/ibdevice.h b/include/linux/ibdevice.h
new file mode 100644
index 000..8418974
--- /dev/null
+++ b/include/linux/ibdevice.h
@@ -0,0 +1,15 @@
+/*
+ * ipoib   Implementation of ipoib_header_ops here.
+ *
+ * Authors:Wengang Wang wen.gang.w...@oracle.com
+ */
+#ifndef _LINUX_IBDEVICE_H
+#define _LINUX_IBDEVICE_H
+
+#include linux/netdevice.h
+
+#ifdef __KERNEL__
+extern const struct header_ops ipoib_header_ops;
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_IBDEVICE_H */
diff --git a/include/linux/if_infiniband.h b/include/linux/if_infiniband.h
new file mode 100644
index 000..9f2d0cf
--- /dev/null
+++ b/include/linux/if_infiniband.h
@@ -0,0 +1,11 @@
+/*
+ * ipoib   Implementation of ipoib_header_ops here.
+ *
+ * Authors:Wengang Wang wen.gang.w...@oracle.com
+ */
+#ifndef _LINUX_IF_INFINIBAND_H
+#define _LINUX_IF_INFINIBAND_H
+
+#include uapi/linux/if_infiniband.h
+
+#endif /* _LINUX_IF_INFINIBAND_H */
diff --git a/include/uapi/linux/if_infiniband.h 
b/include/uapi/linux/if_infiniband.h
index 7d958475..9190ee3 100644
--- a/include/uapi/linux/if_infiniband.h
+++ b/include/uapi/linux/if_infiniband.h
@@ -21,9 +21,19 @@
  * $Id$
  */
 
-#ifndef _LINUX_IF_INFINIBAND_H
-#define _LINUX_IF_INFINIBAND_H
+#ifndef _UAPI_LINUX_IF_INFINIBAND_H
+#define _UAPI_LINUX_IF_INFINIBAND_H
 
+#include net/sch_generic.h
 #define INFINIBAND_ALEN20  /* Octets in IPoIB HW addr

Re: [PATCH] bonding: move ipoib_header_ops to vmlinux

2014-12-02 Thread Wengang Wang

Hi David and Jay,

Then what about the change in this patch?

thanks,
wengang

在 2014年11月26日 09:30, Wengang 写道:

于 2014年11月26日 02:44, David Miller 写道:

From: Jay Vosburgh jay.vosbu...@canonical.com
Date: Tue, 25 Nov 2014 10:41:17 -0800


Or Gerlitz ogerl...@mellanox.com wrote:


On 11/25/2014 8:07 AM, David Miller wrote:

IPOIB should not work over bonding as it requires that the device
use ARPHRD_ETHER.

Hi Dave,

IPoIB devices can be enslaved to both bonding and teaming in their 
HA mode,

the bond device type becomes ARPHRD_INFINIBAND when this happens.

The point was that pktgen disallows ARPHRD_INFINIBAND, not that
bonding does.

Pktgen specifically checks for type != ARPHRD_ETHER, so the
IPoIB bond should not be able to be used with pkgten.  My suspicion is
that pktgen is being configured on the bond first, then an IPoIB slave
is added to the bond; this would change its type in a way that pktgen
wouldn't notice.

+1


I think it goes this way:

1) bond_master is ready
2) bond_enslave enslaves an IPoIB interface, calling bond_setup_by_slave
3) bond_setup_by_slave then changes the master type to ARPHRD_INFINIBAND.

code is like this:

1 /* enslave device slave to bond device master */
2 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
3 {
4 snip...
5 /* set bonding device ether type by slave - bonding netdevices are
6 * created with ether_setup, so when the slave type is not ARPHRD_ETHER
7 * there is a need to override some of the type dependent attribs/funcs.
8 *
9 * bond ether type mutual exclusion - don't allow slaves of dissimilar
10 * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
11 */
12 if (!bond_has_slaves(bond)) {
13 if (bond_dev->type != slave_dev->type) {
14 snip...
15 if (slave_dev->type != ARPHRD_ETHER)
16 bond_setup_by_slave(bond_dev, slave_dev);
17 else {
18 ether_setup(bond_dev);
19 bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
20 }
21
22 call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
23 bond_dev);
24 }
25 snip...
26 }
27
28 static void bond_setup_by_slave(struct net_device *bond_dev,
29 struct net_device *slave_dev)
30 {
31 bond_dev->header_ops = slave_dev->header_ops;
32
33 bond_dev->type = slave_dev->type;
34 bond_dev->hard_header_len = slave_dev->hard_header_len;
35 bond_dev->addr_len = slave_dev->addr_len;
36
37 memcpy(bond_dev->broadcast, slave_dev->broadcast,
38 slave_dev->addr_len);
39 }
40

thanks
wengang
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] bonding: move ipoib_header_ops to vmlinux

2014-11-24 Thread Wengang Wang
When the last slave of a bonding master is removed, the bonding master no longer works.
If packet_snd is then called against the master net_device, it calls
header_ops->create, which still points to the slave's header_ops. In case the slave
is ipoib and the module is unloaded, header_ops would point to an invalid address.
Accessing it will cause problems.
This patch tries to fix this issue by moving ipoib_header_ops to vmlinux to keep
it valid even when the ipoib module is unloaded.

Signed-off-by: Wengang Wang wen.gang.w...@oracle.com
---
 drivers/infiniband/ulp/ipoib/ipoib.h  | 10 -
 drivers/infiniband/ulp/ipoib/ipoib_main.c | 28 +
 include/linux/ibdevice.h  | 15 ++
 include/linux/if_infiniband.h | 11 ++
 include/uapi/linux/if_infiniband.h| 16 ---
 net/Makefile  |  2 +-
 net/infiniband/Makefile   |  5 +
 net/infiniband/infiniband.c   | 34 +++
 8 files changed, 80 insertions(+), 41 deletions(-)
 create mode 100644 include/linux/ibdevice.h
 create mode 100644 include/linux/if_infiniband.h
 create mode 100644 net/infiniband/Makefile
 create mode 100644 net/infiniband/infiniband.c

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index d7562be..7c25670 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -121,16 +121,6 @@ enum {
 
 /* structs */
 
-struct ipoib_header {
-   __be16  proto;
-   u16 reserved;
-};
-
-struct ipoib_cb {
-   struct qdisc_skb_cb qdisc_cb;
-   u8  hwaddr[INFINIBAND_ALEN];
-};
-
 static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
 {
BUILD_BUG_ON(sizeof(skb-cb)  sizeof(struct ipoib_cb));
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 58b5aa3..9233085 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -34,6 +34,7 @@
 
 #include ipoib.h
 
+#include linux/ibdevice.h
 #include linux/module.h
 
 #include linux/init.h
@@ -807,29 +808,6 @@ static void ipoib_timeout(struct net_device *dev)
/* XXX reset QP, etc. */
 }
 
-static int ipoib_hard_header(struct sk_buff *skb,
-struct net_device *dev,
-unsigned short type,
-const void *daddr, const void *saddr, unsigned len)
-{
-   struct ipoib_header *header;
-   struct ipoib_cb *cb = ipoib_skb_cb(skb);
-
-   header = (struct ipoib_header *) skb_push(skb, sizeof *header);
-
-   header-proto = htons(type);
-   header-reserved = 0;
-
-   /*
-* we don't rely on dst_entry structure,  always stuff the
-* destination address into skb-cb so we can figure out where
-* to send the packet later.
-*/
-   memcpy(cb-hwaddr, daddr, INFINIBAND_ALEN);
-
-   return sizeof *header;
-}
-
 static void ipoib_set_mcast_list(struct net_device *dev)
 {
struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1328,10 +1306,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
ipoib_neigh_hash_uninit(dev);
 }
 
-static const struct header_ops ipoib_header_ops = {
-   .create = ipoib_hard_header,
-};
-
 static const struct net_device_ops ipoib_netdev_ops = {
.ndo_uninit  = ipoib_uninit,
.ndo_open= ipoib_open,
diff --git a/include/linux/ibdevice.h b/include/linux/ibdevice.h
new file mode 100644
index 000..8418974
--- /dev/null
+++ b/include/linux/ibdevice.h
@@ -0,0 +1,15 @@
+/*
+ * ipoib   Implementation of ipoib_header_ops here.
+ *
+ * Authors:Wengang Wang wen.gang.w...@oracle.com
+ */
+#ifndef _LINUX_IBDEVICE_H
+#define _LINUX_IBDEVICE_H
+
+#include linux/netdevice.h
+
+#ifdef __KERNEL__
+extern const struct header_ops ipoib_header_ops;
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_IBDEVICE_H */
diff --git a/include/linux/if_infiniband.h b/include/linux/if_infiniband.h
new file mode 100644
index 000..9f2d0cf
--- /dev/null
+++ b/include/linux/if_infiniband.h
@@ -0,0 +1,11 @@
+/*
+ * ipoib   Implementation of ipoib_header_ops here.
+ *
+ * Authors:Wengang Wang wen.gang.w...@oracle.com
+ */
+#ifndef _LINUX_IF_INFINIBAND_H
+#define _LINUX_IF_INFINIBAND_H
+
+#include uapi/linux/if_infiniband.h
+
+#endif /* _LINUX_IF_INFINIBAND_H */
diff --git a/include/uapi/linux/if_infiniband.h 
b/include/uapi/linux/if_infiniband.h
index 7d958475..9190ee3 100644
--- a/include/uapi/linux/if_infiniband.h
+++ b/include/uapi/linux/if_infiniband.h
@@ -21,9 +21,19 @@
  * $Id$
  */
 
-#ifndef _LINUX_IF_INFINIBAND_H
-#define _LINUX_IF_INFINIBAND_H
+#ifndef _UAPI_LINUX_IF_INFINIBAND_H
+#define _UAPI_LINUX_IF_INFINIBAND_H
 
+#include net/sch_generic.h
 #define INFINIBAND_ALEN20  /* Octets in IPoIB HW addr