Re: [PATCH V3] IB/mlx4: Use vmalloc for WR buffers when needed
Hi Matt, Yes, you are right. Since the patch is already merged in, I am going to make a separated patch for that. thanks, wengang 在 2015年12月12日 04:28, Matthew Finlay 写道: Hi Wengang, I was going through your patch set here, and it seems that you missed changing kfree to kvfree in mlx4_ib_destroy_srq(). In the current code if the srq wrid is allocated using vmalloc, then on cleanup we will use kfree, which is a bug. Thanks, -matt On 10/7/15, 10:27 PM, "linux-rdma-ow...@vger.kernel.org on behalf of Wengang Wang" <linux-rdma-ow...@vger.kernel.org on behalf of wen.gang.w...@oracle.com> wrote: There are several hits that WR buffer allocation(kmalloc) failed. It failed at order 3 and/or 4 contigous pages allocation. At the same time there are actually 100MB+ free memory but well fragmented. So try vmalloc when kmalloc failed. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> Acked-by: Or Gerlitz <ogerl...@mellanox.com> --- drivers/infiniband/hw/mlx4/qp.c | 19 +-- drivers/infiniband/hw/mlx4/srq.c | 11 --- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4ad9be3..3ccbd3a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + if (!qp->sq.wrid) + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + if (!qp->rq.wrid) + qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -874,8 +881,8 @@ err_wrid: if (qp_has_rq(init_attr)) 
mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); } err_mtt: @@ -1050,8 +1057,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, >db); ib_umem_release(qp->umem); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(>ib_dev, qp); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index dce5dfe..8d133c4 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); if (!srq->wrid) { - err = -ENOMEM; - goto err_mtt; + srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), + GFP_KERNEL, PAGE_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } } } @@ -204,7 +209,7 @@ err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); else - kfree(srq->wrid); + kvfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, >mtt); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] IB/mlx4: Replace kfree with kvfree in mlx4_ib_destroy_srq
Commit 0ef2f05c7e02ff99c0b5b583d7dee2cd12b053f2 uses vmalloc for WR buffers when needed and uses kvfree to free the buffers. It missed changing kfree to kvfree in mlx4_ib_destroy_srq(). Reported-by: Matthew Finlay <m...@mellanox.com> Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/hw/mlx4/srq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 8d133c4..c394376 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -286,7 +286,7 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq) mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), >db); ib_umem_release(msrq->umem); } else { - kfree(msrq->wrid); + kvfree(msrq->wrid); mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift, >buf); mlx4_db_free(dev->dev, >db); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3] IB/mlx4: Use vmalloc for WR buffers when needed
WR buffer allocation (kmalloc) has been seen to fail several times. It failed on order-3 and/or order-4 contiguous page allocations. At the same time there was actually 100MB+ of free memory, but it was badly fragmented. So fall back to vmalloc when kmalloc fails. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> Acked-by: Or Gerlitz <ogerl...@mellanox.com> --- drivers/infiniband/hw/mlx4/qp.c | 19 +-- drivers/infiniband/hw/mlx4/srq.c | 11 --- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4ad9be3..3ccbd3a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + if (!qp->sq.wrid) + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + if (!qp->rq.wrid) + qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -874,8 +881,8 @@ err_wrid: if (qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); } err_mtt: @@ -1050,8 +1057,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, >db); ib_umem_release(qp->umem); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(>ib_dev, qp); diff --git a/drivers/infiniband/hw/mlx4/srq.c 
b/drivers/infiniband/hw/mlx4/srq.c index dce5dfe..8d133c4 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); if (!srq->wrid) { - err = -ENOMEM; - goto err_mtt; + srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), + GFP_KERNEL, PAGE_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } } } @@ -204,7 +209,7 @@ err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); else - kfree(srq->wrid); + kvfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, >mtt); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] IPoIB: serialize changing on tx_outstanding
Hi Leon, thanks for the review. 在 2015年10月08日 12:33, Leon Romanovsky 写道: On Mon, Sep 28, 2015 at 01:42:10PM +0800, Wengang Wang wrote: Changes to tx_outstanding should be protected by a spinlock or be atomic operations. The following log is found in dmesg: Sep 16 14:20:53 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034733, tx_tail 1034733, tx_outstanding 359 ipoib_sendq_size: 512 Sep 16 14:21:33 naep11x06 kernel: ib0: transmit timeout: latency 9560 msecs Sep 16 14:21:33 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512 Sep 16 14:21:38 naep11x06 kernel: ib0: transmit timeout: latency 14568 msecs Sep 16 14:21:38 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512 And the send queue of ib0 stayed full. When the transmit timeout is reported, the queue is reported as "stopped", but the IPoIB tx_head and tx_tail point to the same value. I am not able to see the corresponding numbers in ipoib_cm_tx (for CM) because I have no vmcore. Though I am not quite sure it is caused by parallel access to tx_outstanding (send path vs. interrupt path), we really need to serialize the changes to tx_outstanding. This patch also makes sure tx_outstanding is increased before post_send is called, to avoid a possible decrease before the increase in case the increase runs later than the interrupt handler. 
Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 40 +++-- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 24 ++-- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index c78dc16..044da94 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; int rc; + unsigned long flags; if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -742,27 +743,36 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ skb_orphan(skb); skb_dst_drop(skb); + spin_lock_irqsave(>lock, flags); + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + } + spin_unlock_irqrestore(>lock, flags); + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); + } + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); if (unlikely(rc)) { ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; + spin_lock_irqsave(>lock, flags); + --priv->tx_outstanding; + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + spin_unlock_irqrestore(>lock, flags); Why are you locking the netif_* calls? Yes, I intended to do that. This make the accessing on tx_outstanding and the reopening of the send queue in the same atomic session which is the expected behavior. 
Otherwise, we may have the following problem (events listed in time order): first thread1 (on cpu1): lock, modify/check tx_outstanding, unlock; then thread2 (on cpu2): lock, modify/check tx_outstanding, unlock; then thread2: reopen queue; finally thread1: stop queue. So we actually want to reopen the send queue, but the result is that we stopped it. thanks, wengang ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(skb); } else { dev->trans_start = jiffies; ++tx->tx_head; - - if (++priv->tx_outstanding == ipoib_sendq_size) { - ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", - tx->qp->qp_num); - netif_stop_queue(dev); - rc = ib_req_notify_cq(priv->send_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc < 0) - ipoib_warn(priv, "req
Re: [PATCH] IPoIB: serialize changing on tx_outstanding
Hi, Any comment on this patch? thanks, wengang 在 2015年09月28日 13:42, Wengang Wang 写道: Changes to tx_outstanding should be protected by a spinlock or be atomic operations. The following log is found in dmesg: Sep 16 14:20:53 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034733, tx_tail 1034733, tx_outstanding 359 ipoib_sendq_size: 512 Sep 16 14:21:33 naep11x06 kernel: ib0: transmit timeout: latency 9560 msecs Sep 16 14:21:33 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512 Sep 16 14:21:38 naep11x06 kernel: ib0: transmit timeout: latency 14568 msecs Sep 16 14:21:38 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512 And the send queue of ib0 stayed full. When the transmit timeout is reported, the queue is reported as "stopped", but the IPoIB tx_head and tx_tail point to the same value. I am not able to see the corresponding numbers in ipoib_cm_tx (for CM) because I have no vmcore. Though I am not quite sure it is caused by parallel access to tx_outstanding (send path vs. interrupt path), we really need to serialize the changes to tx_outstanding. This patch also makes sure tx_outstanding is increased before post_send is called, to avoid a possible decrease before the increase in case the increase runs later than the interrupt handler. 
Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 40 +++-- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 24 ++-- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index c78dc16..044da94 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; int rc; + unsigned long flags; if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -742,27 +743,36 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ skb_orphan(skb); skb_dst_drop(skb); + spin_lock_irqsave(>lock, flags); + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + } + spin_unlock_irqrestore(>lock, flags); + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); + } + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); if (unlikely(rc)) { ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; + spin_lock_irqsave(>lock, flags); + --priv->tx_outstanding; + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + spin_unlock_irqrestore(>lock, flags); ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(skb); } else { dev->trans_start = jiffies; ++tx->tx_head; - - if (++priv->tx_outstanding == ipoib_sendq_size) { - ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", - tx->qp->qp_num); - netif_stop_queue(dev); - rc = 
ib_req_notify_cq(priv->send_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc < 0) - ipoib_warn(priv, "request notify on send CQ failed\n"); - else if (rc) - ipoib_send_comp_handler(priv->send_cq, dev); - } } } @@ -796,10 +806,13 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_tx_lock(dev); ++tx->tx_tail; + + spin_lock_irqsave(>lock, flags); if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && netif_queue_stopped(dev) && test_bit(IPOIB_FLAG_ADMIN_UP, >flags)) netif_wake_queue(dev); +
Re: [PATCH] IB/mlx4: correct order of variables in log
Hi, Any comment on this patch? thanks, wengang 在 2015年09月28日 10:08, Wengang Wang 写道: There is a mis-order in mlx4 log. Fix it. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 0a32020..150fbb3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->method == IB_MGMT_METHOD_GET) || network_view) { mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", -slave, smp->method, smp->mgmt_class, +slave, smp->mgmt_class, smp->method, network_view ? "Network" : "Host", be16_to_cpu(smp->attr_id)); return -EPERM; -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] IB/mlx4: correct order of variables in log
Thanks Or. I will resend the patch with the revised title and your Ack. thanks, wengang 在 2015年10月08日 12:52, Or Gerlitz 写道: On 9/28/2015 5:08 AM, Wengang Wang wrote: There is a mis-order in mlx4 log. Fix it. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> I wanted to ack it, but wait... We want commits to our driver to start with a capital letter, so please resubmit with this title: IB/mlx4: Use correct order of variables in log message You can add Acked-by: Or Gerlitz <ogerl...@mellanox.com> -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2] IB/mlx4: Use correct order of variables in log message
There is a mis-order in mlx4 log. Fix it. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> Acked-by: Or Gerlitz <ogerl...@mellanox.com> --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 0a32020..150fbb3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->method == IB_MGMT_METHOD_GET) || network_view) { mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", -slave, smp->method, smp->mgmt_class, +slave, smp->mgmt_class, smp->method, network_view ? "Network" : "Host", be16_to_cpu(smp->attr_id)); return -EPERM; -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] IPoIB: serialize changing on tx_outstanding
Changes to tx_outstanding should be protected by a spinlock or be atomic operations. The following log is found in dmesg: Sep 16 14:20:53 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034733, tx_tail 1034733, tx_outstanding 359 ipoib_sendq_size: 512 Sep 16 14:21:33 naep11x06 kernel: ib0: transmit timeout: latency 9560 msecs Sep 16 14:21:33 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512 Sep 16 14:21:38 naep11x06 kernel: ib0: transmit timeout: latency 14568 msecs Sep 16 14:21:38 naep11x06 kernel: ib0: queue stopped 1, tx_head 1034854, tx_tail 1034854, tx_outstanding 511 ipoib_sendq_size: 512 And the send queue of ib0 stayed full. When the transmit timeout is reported, the queue is reported as "stopped", but the IPoIB tx_head and tx_tail point to the same value. I am not able to see the corresponding numbers in ipoib_cm_tx (for CM) because I have no vmcore. Though I am not quite sure it is caused by parallel access to tx_outstanding (send path vs. interrupt path), we really need to serialize the changes to tx_outstanding. This patch also makes sure tx_outstanding is increased before post_send is called, to avoid a possible decrease before the increase in case the increase runs later than the interrupt handler. 
Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 40 +++-- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 24 ++-- 2 files changed, 50 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index c78dc16..044da94 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -710,6 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_tx_buf *tx_req; int rc; + unsigned long flags; if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -742,27 +743,36 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ skb_orphan(skb); skb_dst_drop(skb); + spin_lock_irqsave(>lock, flags); + if (++priv->tx_outstanding == ipoib_sendq_size) { + ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", + tx->qp->qp_num); + netif_stop_queue(dev); + } + spin_unlock_irqrestore(>lock, flags); + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + if (rc < 0) + ipoib_warn(priv, "request notify on send CQ failed\n"); + else if (rc) + ipoib_send_comp_handler(priv->send_cq, dev); + } + rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); if (unlikely(rc)) { ipoib_warn(priv, "post_send failed, error %d\n", rc); ++dev->stats.tx_errors; + spin_lock_irqsave(>lock, flags); + --priv->tx_outstanding; + if (netif_queue_stopped(dev)) + netif_wake_queue(dev); + spin_unlock_irqrestore(>lock, flags); ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(skb); } else { dev->trans_start = jiffies; ++tx->tx_head; - - if (++priv->tx_outstanding == ipoib_sendq_size) { - ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", - tx->qp->qp_num); - netif_stop_queue(dev); - rc = 
ib_req_notify_cq(priv->send_cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc < 0) - ipoib_warn(priv, "request notify on send CQ failed\n"); - else if (rc) - ipoib_send_comp_handler(priv->send_cq, dev); - } } } @@ -796,10 +806,13 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_tx_lock(dev); ++tx->tx_tail; + + spin_lock_irqsave(>lock, flags); if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && netif_queue_stopped(dev) && test_bit(IPOIB_FLAG_ADMIN_UP, >flags)) netif_wake_queue(dev); + spin_unlock_irqrestore(>lock, flags); if (wc->status != IB_
[PATCH] IB/mlx4: correct order of variables in log
There is a mis-order in mlx4 log. Fix it. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 0a32020..150fbb3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1010,7 +1010,7 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, if (!(smp->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED && smp->method == IB_MGMT_METHOD_GET) || network_view) { mlx4_err(dev, "Unprivileged slave %d is trying to execute a Subnet MGMT MAD, class 0x%x, method 0x%x, view=%s for attr 0x%x. Rejecting\n", -slave, smp->method, smp->mgmt_class, +slave, smp->mgmt_class, smp->method, network_view ? "Network" : "Host", be16_to_cpu(smp->attr_id)); return -EPERM; -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] IB/mlx4: Use vmalloc for WR buffers when needed
There are several hits that WR buffer allocation(kmalloc) failed. It failed at order 3 and/or 4 contigous pages allocation. At the same time there are actually 100MB+ free memory but well fragmented. So try vmalloc when kmalloc failed. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/hw/mlx4/qp.c | 20 ++-- drivers/infiniband/hw/mlx4/srq.c | 11 --- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4ad9be3..f152d8a 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + if (!qp->sq.wrid) + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + if (!qp->rq.wrid) + qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -874,8 +881,8 @@ err_wrid: if (qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); } err_mtt: @@ -1050,8 +1057,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, >db); ib_umem_release(qp->umem); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + kvfree(qp->sq.wrid); + kvfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(>ib_dev, qp); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 
dce5dfe..8d133c4 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); if (!srq->wrid) { - err = -ENOMEM; - goto err_mtt; + srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), + GFP_KERNEL, PAGE_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } } } @@ -204,7 +209,7 @@ err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); else - kfree(srq->wrid); + kvfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, >mtt); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] IB/mlx4: Use vmalloc for WR buffers when needed
在 2015年09月24日 16:07, Or Gerlitz 写道: On 9/24/2015 9:52 AM, Wengang Wang wrote: } else { -kfree(qp->sq.wrid); -kfree(qp->rq.wrid); +if (is_vmalloc_addr(qp->sq.wrid)) +vfree(qp->sq.wrid); +else +kfree(qp->sq.wrid); + +if (is_vmalloc_addr(qp->rq.wrid)) +vfree(qp->rq.wrid); +else +kfree(qp->rq.wrid); } NO just call kvfree, see commit 914efb0 "mlx4: don't duplicate kvfree()" Yeap, will re-post. thanks, wengang -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] mlx4: vmalloc for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid
Hi Or, 在 2015年09月24日 13:33, Or Gerlitz 写道: On 9/24/2015 5:10 AM, Wengang Wang wrote: Use __vmalloc to allocate memory for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid. There have been several cases where the kmalloc for wrid failed with a call trace like the following: Using vmalloc and friends should be done with care; specifically, we'd like to go there only when needed (namely when kmalloc fails), or else we can get into another set of troubles. Please use the practice introduced in commit 89dd86d "mlx4_core: Allow large mlx4_buddy bitmaps" to go the vmalloc way only when needed. Note that you can just call kvfree later; there is no need to branch when freeing things (as was fixed later in commit 914efb0 "mlx4: don't duplicate kvfree()"). And have the patch title be IB/mlx4: Use vmalloc for WR buffers when needed I don't think the OOM oops in the change log helps, skip it. OK, I will look at commit 89dd86d and try again. Thanks for the quick response. thanks wengang Or. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] IB/mlx4: Use vmalloc for WR buffers when needed
There are several hits that WR buffer allocation(kmalloc) failed. It failed at order 3 and/or 4 contigous pages allocation. At the same time there are actually 100MB+ free memory but well fragmented. So try vmalloc when kmalloc failed. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/hw/mlx4/qp.c | 34 -- drivers/infiniband/hw/mlx4/srq.c | 14 +++--- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4ad9be3..da551e1 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + if (!qp->sq.wrid) + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); + qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + if (!qp->rq.wrid) + qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -874,8 +881,15 @@ err_wrid: if (qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + if (is_vmalloc_addr(qp->sq.wrid)) + vfree(qp->sq.wrid); + else + kfree(qp->sq.wrid); + + if (is_vmalloc_addr(qp->rq.wrid)) + vfree(qp->rq.wrid); + else + kfree(qp->rq.wrid); } err_mtt: @@ -1050,8 +1064,16 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, >db); ib_umem_release(qp->umem); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + if (is_vmalloc_addr(qp->sq.wrid)) + vfree(qp->sq.wrid); + else + kfree(qp->sq.wrid); + + if (is_vmalloc_addr(qp->rq.wrid)) + vfree(qp->rq.wrid); 
+ else + kfree(qp->rq.wrid); + if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(>ib_dev, qp); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index dce5dfe..d0e8d40 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx4_ib.h" #include "user.h" @@ -172,8 +173,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); if (!srq->wrid) { - err = -ENOMEM; - goto err_mtt; + srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), + GFP_KERNEL, PAGE_KERNEL); + if (!srq->wrid) { + err = -ENOMEM; + goto err_mtt; + } } } @@ -204,7 +209,10 @@ err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), >db); else - kfree(srq->wrid); + if (is_vmalloc_addr(srq->wrid)) + vfree(srq->wrid); + else + kfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, >mtt); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] IB/mlx4: Use vmalloc for WR buffers when needed
Hi Or, 在 2015年09月24日 19:57, Or Gerlitz 写道: On Thu, Sep 24, 2015 at 1:49 PM, Wengang Wang <wen.gang.w...@oracle.com> wrote: @@ -786,8 +787,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + if (!qp->sq.wrid) + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), + gfp, PAGE_KERNEL); On other spots of mlx4, we're using vmalloc and not __vmalloc, any pros/cons for going that way too here? vmalloc always uses GFP_KERNEL | __GFP_HIGHMEM, so we can't pass the gfp flag in with it. We should respect the original code, which needs to pass the flag. thanks, wengang -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] mlx4: vmalloc for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid
Use __vmalloc to allocate memory for mlx4_ib_wq.wrid and mlx4_ib_srq.wrid. We have seen several cases where the kmalloc for wrid failed with a call trace like the following: kworker/u:4: page allocation failure: order:4, mode:0x2000d0 Pid: 16388, comm: kworker/u:4 Not tainted Call Trace: [] warn_alloc_failed+0xf3/0x160 [] ? __alloc_pages_direct_compact+0x1fa/0x200 [] __alloc_pages_slowpath+0x4a6/0x7b0 [] __alloc_pages_nodemask+0x2fb/0x320 [] kmem_getpages+0x67/0x1c0 [] fallback_alloc+0x187/0x250 [] cache_alloc_node+0x9a/0x150 [] __kmalloc+0x18b/0x340 [] ? create_qp_common+0x431/0x8e0 [mlx4_ib] [] create_qp_common+0x431/0x8e0 [mlx4_ib] [] ? kzalloc.clone.1+0xe/0x10 [mlx4_ib] [] mlx4_ib_create_qp+0x207/0x310 [mlx4_ib] [] ib_create_qp+0x41/0x1c0 [ib_core] [] ipoib_cm_create_tx_qp+0xc8/0x130 [ib_ipoib] [] ? __vmalloc_node+0x35/0x40 [] ipoib_cm_tx_init+0x65/0x380 [ib_ipoib] [] ? sched_clock_cpu+0xcd/0x110 [] ? xen_mc_flush+0xb0/0x1b0 [] ipoib_cm_tx_start+0x230/0x3d0 [ib_ipoib] [] process_one_work+0x180/0x420 [] worker_thread+0x12e/0x390 [] ? manage_workers+0x180/0x180 [] kthread+0xce/0xe0 [] ? xen_end_context_switch+0x1e/0x30 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? kthread_freezable_should_stop+0x70/0x70 It needs 16 contiguous pages and failed. At the time there was actually 100MB+ of free memory: Node 0 Normal: 10268*4kB (UM) 7443*8kB (UEM) 1647*16kB (UM) 35*32kB (UR) 1*64kB (R) 4*128kB (R) 1*256kB (R) 0*512kB 1*1024kB (R) 0*2048kB 0*4096kB = 129944kB I also hit the same error at order 3. 
Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- drivers/infiniband/hw/mlx4/qp.c | 15 +-- drivers/infiniband/hw/mlx4/srq.c | 6 -- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 4ad9be3..754ceb9 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -34,6 +34,7 @@ #include #include #include +#include <linux/vmalloc.h> #include #include @@ -786,8 +787,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp); + qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp, +PAGE_KERNEL); + qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp, +PAGE_KERNEL); if (!qp->sq.wrid || !qp->rq.wrid) { err = -ENOMEM; goto err_wrid; @@ -874,8 +877,8 @@ err_wrid: if (qp_has_rq(init_attr)) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + vfree(qp->sq.wrid); + vfree(qp->rq.wrid); } err_mtt: @@ -1050,8 +1053,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, &qp->db); ib_umem_release(qp->umem); } else { - kfree(qp->sq.wrid); - kfree(qp->rq.wrid); + vfree(qp->sq.wrid); + vfree(qp->rq.wrid); if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) free_proxy_bufs(&dev->ib_dev, qp); diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index dce5dfe..6d21bb2 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -34,6 +34,7 @@ #include #include #include +#include <linux/vmalloc.h> #include "mlx4_ib.h" #include "user.h" @@ -170,7 +171,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; - srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); + srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), 
GFP_KERNEL, + PAGE_KERNEL); if (!srq->wrid) { err = -ENOMEM; goto err_mtt; @@ -204,7 +206,7 @@ err_wrid: if (pd->uobject) mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db); else - kfree(srq->wrid); + vfree(srq->wrid); err_mtt: mlx4_mtt_cleanup(dev->dev, &srq->mtt); -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] rds: rds_ib_device.refcount overflow
Doug, No problem. I see the patch has been picked up. thanks, wengang 在 2015年07月29日 22:36, Doug Ledford 写道: On 07/12/2015 09:18 PM, Wengang Wang wrote: Hi Doug, What do you think about this patch? Sorry, I picked this up already. I must have missed sending out the acknowledgment on this one. thanks, wengang 在 2015年07月06日 14:35, Wengang Wang 写道: Fixes: 3e0249f9c05c ("RDS/IB: add refcount tracking to struct rds_ib_device") A drop of rds_ib_device.refcount is missing in case rds_ib_alloc_fmr fails (mr pool running out). This leads to refcount overflow. The BUG_ON at line 117 (see below) is triggered. From the vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } The fix is to drop the refcount when rds_ib_alloc_fmr fails. 
Signed-off-by: Wengang Wang wen.gang.w...@oracle.com Reviewed-by: Haggai Eran hagg...@mellanox.com --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bf..657ba9f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); -if (IS_ERR(ibmr)) +if (IS_ERR(ibmr)) { +rds_ib_dev_put(rds_ibdev); return ibmr; +} ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] rds: rds_ib_device.refcount overflow
Hi Doug, What do you think about this patch? thanks, wengang 在 2015年07月06日 14:35, Wengang Wang 写道: Fixes: 3e0249f9c05c ("RDS/IB: add refcount tracking to struct rds_ib_device") A drop of rds_ib_device.refcount is missing in case rds_ib_alloc_fmr fails (mr pool running out). This leads to refcount overflow. The BUG_ON at line 117 (see below) is triggered. From the vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } The fix is to drop the refcount when rds_ib_alloc_fmr fails. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> Reviewed-by: Haggai Eran <hagg...@mellanox.com> --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bf..657ba9f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] rds: rds_ib_device.refcount overflow
Haggai, Thanks for the review! I will add the message you suggested and re-post. thanks, wengang 在 2015年07月06日 14:18, Haggai Eran 写道: On 24/06/2015 07:54, Wengang Wang wrote: A drop of rds_ib_device.refcount is missing in case rds_ib_alloc_fmr fails (mr pool running out). This leads to refcount overflow. The BUG_ON at line 117 (see below) is triggered. From the vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } The fix is to drop the refcount when rds_ib_alloc_fmr fails. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bf..657ba9f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) It seems like the function indeed is missing a put on the rds_ibdev in that case. Reviewed-by: Haggai Eran <hagg...@mellanox.com> You may also want to add: Fixes: 3e0249f9c05c ("RDS/IB: add refcount tracking to struct rds_ib_device") -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] rds: rds_ib_device.refcount overflow
Fixes: 3e0249f9c05c ("RDS/IB: add refcount tracking to struct rds_ib_device") A drop of rds_ib_device.refcount is missing in case rds_ib_alloc_fmr fails (mr pool running out). This leads to refcount overflow. The BUG_ON at line 117 (see below) is triggered. From the vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } The fix is to drop the refcount when rds_ib_alloc_fmr fails. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> Reviewed-by: Haggai Eran <hagg...@mellanox.com> --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bf..657ba9f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] rds: rds_ib_device.refcount overflow
Hi Doug, Could you please review this patch? thanks, wengang 在 2015年06月24日 12:54, Wengang Wang 写道: A drop of rds_ib_device.refcount is missing in case rds_ib_alloc_fmr fails (mr pool running out). This leads to refcount overflow. The BUG_ON at line 117 (see below) is triggered. From the vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is evidence that the mr pool is used up, so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } The fix is to drop the refcount when rds_ib_alloc_fmr fails. Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com> --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bf..657ba9f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] rds: re-entry of rds_ib_xmit/rds_iw_xmit
Hi, Could anyone review this patch please? thanks, wengang 在 2015年05月21日 13:11, Wengang Wang 写道: The BUG_ON at line 452/453 is triggered in function rds_send_xmit. 441 while (ret) { 442 tmp = min_t(int, ret, sg->length - 443 conn->c_xmit_data_off); 444 conn->c_xmit_data_off += tmp; 445 ret -= tmp; 446 if (conn->c_xmit_data_off == sg->length) { 447 conn->c_xmit_data_off = 0; 448 sg++; 449 conn->c_xmit_sg++; 450 if (ret != 0 && conn->c_xmit_sg == rm->data.op_nents) 451 printk(KERN_ERR "conn %p rm %p sg %p ret %d\n", conn, rm, sg, ret); 452 BUG_ON(ret != 0 && 453 conn->c_xmit_sg == rm->data.op_nents); 454 } 455 } It is complaining that the total sent length is bigger than what we want to send. rds_ib_xmit() is wrong on the second entry for the same rds_message, returning a wrong value. The sg and off passed by rds_send_xmit to rds_ib_xmit are based on scatterlist.offset/length, but rds_ib_xmit acts on scatterlist.dma_address/dma_length. In case dma_length is larger than length, there is a problem. For the 2nd and later entries of rds_ib_xmit for the same rds_message, at least one of the following two is wrong: 1) the scatterlist to start with; the chosen one can be far beyond the correct one. 2) the offset to start with within the scatterlist. Fix: add op_dmasg and op_dmaoff to the rm_data_op structure, indicating respectively the scatterlist and the offset within it to start with for rds_ib_xmit. op_dmasg and op_dmaoff are initialized to zero when doing dma mapping the first time the message is seen, and are changed when filling send slots. The same applies to rds_iw_xmit too. 
Signed-off-by: Wengang Wang wen.gang.w...@oracle.com --- net/rds/ib_send.c | 17 +++-- net/rds/iw_send.c | 18 +++--- net/rds/rds.h | 2 ++ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index bd3825d..1df6c84 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -605,6 +605,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } rds_message_addref(rm); + rm-data.op_dmasg = 0; + rm-data.op_dmaoff = 0; ic-i_data_op = rm-data; /* Finalize the header */ @@ -658,7 +660,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = ic-i_sends[pos]; first = send; prev = NULL; - scat = ic-i_data_op-op_sg[sg]; + scat = ic-i_data_op-op_sg[rm-data.op_dmasg]; i = 0; do { unsigned int len = 0; @@ -680,17 +682,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* Set up the data, if present */ if (i work_alloc scat != rm-data.op_sg[rm-data.op_count]) { - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm-data.op_dmaoff); send-s_wr.num_sge = 2; - send-s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send-s_sge[1].addr = ib_sg_dma_address(dev, scat); + send-s_sge[1].addr += rm-data.op_dmaoff; send-s_sge[1].length = len; bytes_sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm-data.op_dmaoff += len; + if (rm-data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm-data.op_dmasg++; + rm-data.op_dmaoff = 0; } } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 1383478..334fe98 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -581,6 +581,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ic-i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; ic-i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; rds_message_addref(rm); + rm-data.op_dmasg = 0; + rm-data.op_dmaoff = 0; ic-i_rm = rm; /* Finalize the header */ @@ -622,7 +624,7 @@ int 
rds_iw_xmit(struct rds_connection *conn
[PATCH] rds: re-entry of rds_ib_xmit/rds_iw_xmit
The BUG_ON at line 452/453 is triggered in function rds_send_xmit. 441 while (ret) { 442 tmp = min_t(int, ret, sg->length - 443 conn->c_xmit_data_off); 444 conn->c_xmit_data_off += tmp; 445 ret -= tmp; 446 if (conn->c_xmit_data_off == sg->length) { 447 conn->c_xmit_data_off = 0; 448 sg++; 449 conn->c_xmit_sg++; 450 if (ret != 0 && conn->c_xmit_sg == rm->data.op_nents) 451 printk(KERN_ERR "conn %p rm %p sg %p ret %d\n", conn, rm, sg, ret); 452 BUG_ON(ret != 0 && 453 conn->c_xmit_sg == rm->data.op_nents); 454 } 455 } It is complaining that the total sent length is bigger than what we want to send. rds_ib_xmit() is wrong on the second entry for the same rds_message, returning a wrong value. The sg and off passed by rds_send_xmit to rds_ib_xmit are based on scatterlist.offset/length, but rds_ib_xmit acts on scatterlist.dma_address/dma_length. In case dma_length is larger than length, there is a problem. For the 2nd and later entries of rds_ib_xmit for the same rds_message, at least one of the following two is wrong: 1) the scatterlist to start with; the chosen one can be far beyond the correct one. 2) the offset to start with within the scatterlist. Fix: add op_dmasg and op_dmaoff to the rm_data_op structure, indicating respectively the scatterlist and the offset within it to start with for rds_ib_xmit. op_dmasg and op_dmaoff are initialized to zero when doing dma mapping the first time the message is seen, and are changed when filling send slots. The same applies to rds_iw_xmit too. 
Signed-off-by: Wengang Wang wen.gang.w...@oracle.com --- net/rds/ib_send.c | 17 +++-- net/rds/iw_send.c | 18 +++--- net/rds/rds.h | 2 ++ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index bd3825d..1df6c84 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -605,6 +605,8 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } rds_message_addref(rm); + rm-data.op_dmasg = 0; + rm-data.op_dmaoff = 0; ic-i_data_op = rm-data; /* Finalize the header */ @@ -658,7 +660,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = ic-i_sends[pos]; first = send; prev = NULL; - scat = ic-i_data_op-op_sg[sg]; + scat = ic-i_data_op-op_sg[rm-data.op_dmasg]; i = 0; do { unsigned int len = 0; @@ -680,17 +682,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* Set up the data, if present */ if (i work_alloc scat != rm-data.op_sg[rm-data.op_count]) { - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + len = min(RDS_FRAG_SIZE, + ib_sg_dma_len(dev, scat) - rm-data.op_dmaoff); send-s_wr.num_sge = 2; - send-s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send-s_sge[1].addr = ib_sg_dma_address(dev, scat); + send-s_sge[1].addr += rm-data.op_dmaoff; send-s_sge[1].length = len; bytes_sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { + rm-data.op_dmaoff += len; + if (rm-data.op_dmaoff == ib_sg_dma_len(dev, scat)) { scat++; - off = 0; + rm-data.op_dmasg++; + rm-data.op_dmaoff = 0; } } diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 1383478..334fe98 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -581,6 +581,8 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, ic-i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; ic-i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes; rds_message_addref(rm); + rm-data.op_dmasg = 0; + rm-data.op_dmaoff = 0; ic-i_rm = rm; /* Finalize the header */ @@ -622,7 +624,7 @@ int 
rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = ic-i_sends[pos
[PATCH] bonding: move ipoib_header_ops to vmlinux
When last slave of a bonding master is removed, the bonding then does not work. At the time if packet_snd is called against with a master net_device, it calls then header_ops-create which points to slave's header_ops. In case the slave is ipoib and the module is unloaded, header_ops would point to invalid address. Accessing it will cause problem. This patch tries to fix this issue by moving ipoib_header_ops to vmlinux to keep it valid even when ipoib module is unloaded. Signed-off-by: Wengang Wang wen.gang.w...@oracle.com --- drivers/infiniband/ulp/ipoib/ipoib.h | 10 - drivers/infiniband/ulp/ipoib/ipoib_main.c | 28 + include/linux/ibdevice.h | 15 ++ include/linux/if_infiniband.h | 11 ++ include/uapi/linux/if_infiniband.h| 16 --- net/Makefile | 2 +- net/infiniband/Makefile | 5 + net/infiniband/infiniband.c | 34 +++ 8 files changed, 80 insertions(+), 41 deletions(-) create mode 100644 include/linux/ibdevice.h create mode 100644 include/linux/if_infiniband.h create mode 100644 net/infiniband/Makefile create mode 100644 net/infiniband/infiniband.c diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562be..7c25670 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -121,16 +121,6 @@ enum { /* structs */ -struct ipoib_header { - __be16 proto; - u16 reserved; -}; - -struct ipoib_cb { - struct qdisc_skb_cb qdisc_cb; - u8 hwaddr[INFINIBAND_ALEN]; -}; - static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb-cb) sizeof(struct ipoib_cb)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 58b5aa3..9233085 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -34,6 +34,7 @@ #include ipoib.h +#include linux/ibdevice.h #include linux/module.h #include linux/init.h @@ -807,29 +808,6 @@ static void ipoib_timeout(struct net_device *dev) /* XXX reset 
QP, etc. */ } -static int ipoib_hard_header(struct sk_buff *skb, -struct net_device *dev, -unsigned short type, -const void *daddr, const void *saddr, unsigned len) -{ - struct ipoib_header *header; - struct ipoib_cb *cb = ipoib_skb_cb(skb); - - header = (struct ipoib_header *) skb_push(skb, sizeof *header); - - header-proto = htons(type); - header-reserved = 0; - - /* -* we don't rely on dst_entry structure, always stuff the -* destination address into skb-cb so we can figure out where -* to send the packet later. -*/ - memcpy(cb-hwaddr, daddr, INFINIBAND_ALEN); - - return sizeof *header; -} - static void ipoib_set_mcast_list(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1328,10 +1306,6 @@ void ipoib_dev_cleanup(struct net_device *dev) ipoib_neigh_hash_uninit(dev); } -static const struct header_ops ipoib_header_ops = { - .create = ipoib_hard_header, -}; - static const struct net_device_ops ipoib_netdev_ops = { .ndo_uninit = ipoib_uninit, .ndo_open= ipoib_open, diff --git a/include/linux/ibdevice.h b/include/linux/ibdevice.h new file mode 100644 index 000..8418974 --- /dev/null +++ b/include/linux/ibdevice.h @@ -0,0 +1,15 @@ +/* + * ipoib Implementation of ipoib_header_ops here. + * + * Authors:Wengang Wang wen.gang.w...@oracle.com + */ +#ifndef _LINUX_IBDEVICE_H +#define _LINUX_IBDEVICE_H + +#include linux/netdevice.h + +#ifdef __KERNEL__ +extern const struct header_ops ipoib_header_ops; +#endif /* __KERNEL__ */ + +#endif /* _LINUX_IBDEVICE_H */ diff --git a/include/linux/if_infiniband.h b/include/linux/if_infiniband.h new file mode 100644 index 000..9f2d0cf --- /dev/null +++ b/include/linux/if_infiniband.h @@ -0,0 +1,11 @@ +/* + * ipoib Implementation of ipoib_header_ops here. 
+ * + * Authors:Wengang Wang wen.gang.w...@oracle.com + */ +#ifndef _LINUX_IF_INFINIBAND_H +#define _LINUX_IF_INFINIBAND_H + +#include uapi/linux/if_infiniband.h + +#endif /* _LINUX_IF_INFINIBAND_H */ diff --git a/include/uapi/linux/if_infiniband.h b/include/uapi/linux/if_infiniband.h index 7d958475..9190ee3 100644 --- a/include/uapi/linux/if_infiniband.h +++ b/include/uapi/linux/if_infiniband.h @@ -21,9 +21,19 @@ * $Id$ */ -#ifndef _LINUX_IF_INFINIBAND_H -#define _LINUX_IF_INFINIBAND_H +#ifndef _UAPI_LINUX_IF_INFINIBAND_H +#define _UAPI_LINUX_IF_INFINIBAND_H +#include net/sch_generic.h #define INFINIBAND_ALEN20 /* Octets in IPoIB HW addr
Re: [PATCH] bonding: move ipoib_header_ops to vmlinux
Hi David and Jay, Then how about the change in this patch? thanks, wengang 在 2014年11月26日 09:30, Wengang 写道: 于 2014年11月26日 02:44, David Miller 写道: From: Jay Vosburgh <jay.vosbu...@canonical.com> Date: Tue, 25 Nov 2014 10:41:17 -0800 Or Gerlitz <ogerl...@mellanox.com> wrote: On 11/25/2014 8:07 AM, David Miller wrote: IPOIB should not work over bonding as it requires that the device use ARPHRD_ETHER. Hi Dave, IPoIB devices can be enslaved to both bonding and teaming in their HA mode, the bond device type becomes ARPHRD_INFINIBAND when this happens. The point was that pktgen disallows ARPHRD_INFINIBAND, not that bonding does. Pktgen specifically checks for type != ARPHRD_ETHER, so the IPoIB bond should not be able to be used with pktgen. My suspicion is that pktgen is being configured on the bond first, then an IPoIB slave is added to the bond; this would change its type in a way that pktgen wouldn't notice. +1 I think it goes this way: 1) bond_master is ready 2) bond_enslave enslaves an IPoIB interface, calling bond_setup_by_slave 3) then bond_setup_by_slave changes the master type to ARPHRD_INFINIBAND. The code is like this: 1 /* enslave device <slave> to bond device <master> */ 2 int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) 3 { 4 snip... 5 /* set bonding device ether type by slave - bonding netdevices are 6 * created with ether_setup, so when the slave type is not ARPHRD_ETHER 7 * there is a need to override some of the type dependent attribs/funcs. 8 * 9 * bond ether type mutual exclusion - don't allow slaves of dissimilar 10 * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond 11 */ 12 if (!bond_has_slaves(bond)) { 13 if (bond_dev->type != slave_dev->type) { 14 snip... 15 if (slave_dev->type != ARPHRD_ETHER) 16 bond_setup_by_slave(bond_dev, slave_dev); 17 else { 18 ether_setup(bond_dev); 19 bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; 20 } 21 22 call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, 23 bond_dev); 24 } 25 snip... 
26 } 27 28 static void bond_setup_by_slave(struct net_device *bond_dev, 29 struct net_device *slave_dev) 30 { 31 bond_dev->header_ops = slave_dev->header_ops; 32 33 bond_dev->type = slave_dev->type; 34 bond_dev->hard_header_len = slave_dev->hard_header_len; 35 bond_dev->addr_len = slave_dev->addr_len; 36 37 memcpy(bond_dev->broadcast, slave_dev->broadcast, 38 slave_dev->addr_len); 39 } 40 thanks wengang -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] bonding: move ipoib_header_ops to vmlinux
When last slave of a bonding master is removed, the bonding then does not work. At the time if packet_snd is called against with a master net_device, it calls then header_ops-create which points to slave's header_ops. In case the slave is ipoib and the module is unloaded, header_ops would point to invalid address. Accessing it will cause problem. This patch tries to fix this issue by moving ipoib_header_ops to vmlinux to keep it valid even when ipoib module is unloaded. Signed-off-by: Wengang Wang wen.gang.w...@oracle.com --- drivers/infiniband/ulp/ipoib/ipoib.h | 10 - drivers/infiniband/ulp/ipoib/ipoib_main.c | 28 + include/linux/ibdevice.h | 15 ++ include/linux/if_infiniband.h | 11 ++ include/uapi/linux/if_infiniband.h| 16 --- net/Makefile | 2 +- net/infiniband/Makefile | 5 + net/infiniband/infiniband.c | 34 +++ 8 files changed, 80 insertions(+), 41 deletions(-) create mode 100644 include/linux/ibdevice.h create mode 100644 include/linux/if_infiniband.h create mode 100644 net/infiniband/Makefile create mode 100644 net/infiniband/infiniband.c diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562be..7c25670 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -121,16 +121,6 @@ enum { /* structs */ -struct ipoib_header { - __be16 proto; - u16 reserved; -}; - -struct ipoib_cb { - struct qdisc_skb_cb qdisc_cb; - u8 hwaddr[INFINIBAND_ALEN]; -}; - static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb-cb) sizeof(struct ipoib_cb)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 58b5aa3..9233085 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -34,6 +34,7 @@ #include ipoib.h +#include linux/ibdevice.h #include linux/module.h #include linux/init.h @@ -807,29 +808,6 @@ static void ipoib_timeout(struct net_device *dev) /* XXX reset 
QP, etc. */ } -static int ipoib_hard_header(struct sk_buff *skb, -struct net_device *dev, -unsigned short type, -const void *daddr, const void *saddr, unsigned len) -{ - struct ipoib_header *header; - struct ipoib_cb *cb = ipoib_skb_cb(skb); - - header = (struct ipoib_header *) skb_push(skb, sizeof *header); - - header-proto = htons(type); - header-reserved = 0; - - /* -* we don't rely on dst_entry structure, always stuff the -* destination address into skb-cb so we can figure out where -* to send the packet later. -*/ - memcpy(cb-hwaddr, daddr, INFINIBAND_ALEN); - - return sizeof *header; -} - static void ipoib_set_mcast_list(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1328,10 +1306,6 @@ void ipoib_dev_cleanup(struct net_device *dev) ipoib_neigh_hash_uninit(dev); } -static const struct header_ops ipoib_header_ops = { - .create = ipoib_hard_header, -}; - static const struct net_device_ops ipoib_netdev_ops = { .ndo_uninit = ipoib_uninit, .ndo_open= ipoib_open, diff --git a/include/linux/ibdevice.h b/include/linux/ibdevice.h new file mode 100644 index 000..8418974 --- /dev/null +++ b/include/linux/ibdevice.h @@ -0,0 +1,15 @@ +/* + * ipoib Implementation of ipoib_header_ops here. + * + * Authors:Wengang Wang wen.gang.w...@oracle.com + */ +#ifndef _LINUX_IBDEVICE_H +#define _LINUX_IBDEVICE_H + +#include linux/netdevice.h + +#ifdef __KERNEL__ +extern const struct header_ops ipoib_header_ops; +#endif /* __KERNEL__ */ + +#endif /* _LINUX_IBDEVICE_H */ diff --git a/include/linux/if_infiniband.h b/include/linux/if_infiniband.h new file mode 100644 index 000..9f2d0cf --- /dev/null +++ b/include/linux/if_infiniband.h @@ -0,0 +1,11 @@ +/* + * ipoib Implementation of ipoib_header_ops here. 
+ * + * Authors:Wengang Wang wen.gang.w...@oracle.com + */ +#ifndef _LINUX_IF_INFINIBAND_H +#define _LINUX_IF_INFINIBAND_H + +#include uapi/linux/if_infiniband.h + +#endif /* _LINUX_IF_INFINIBAND_H */ diff --git a/include/uapi/linux/if_infiniband.h b/include/uapi/linux/if_infiniband.h index 7d958475..9190ee3 100644 --- a/include/uapi/linux/if_infiniband.h +++ b/include/uapi/linux/if_infiniband.h @@ -21,9 +21,19 @@ * $Id$ */ -#ifndef _LINUX_IF_INFINIBAND_H -#define _LINUX_IF_INFINIBAND_H +#ifndef _UAPI_LINUX_IF_INFINIBAND_H +#define _UAPI_LINUX_IF_INFINIBAND_H +#include net/sch_generic.h #define INFINIBAND_ALEN20 /* Octets in IPoIB HW addr