There exists a race condition when using c4iw_wr_wait objects that are declared on the stack. This was being done in a few places where we are sending work requests to the FW and awaiting replies, but we don't have an endpoint structure with an embedded c4iw_wr_wait struct. So the code was allocating it locally on the stack. Bad design. The race is this:
thread on cpuX declares the wait object on the stack, then posts a firmware WR with that wait object ptr as the cookie to be returned in the WR reply. This thread will proceed to block in wait_event_timeout() but before it does: An interrupt runs on cpuY with the WR reply. fw6_msg() handles this and calls c4iw_wake_up(). c4iw_wake_up() sets the condition variable in the c4iw_wr_wait object to TRUE and will call wake_up(), but before it calls wake_up(): The thread on cpuX calls c4iw_wait_for_reply(), which calls wait_event_timeout(). The wait_event_timeout() macro checks the condition variable and returns immediately since it is TRUE. So this thread never blocks/sleeps. The function then returns effectively deallocating the c4iw_wr_wait object that was on the stack. So at this point cpuY has a pointer to the c4iw_wr_wait object that is no longer valid. Further its pointing to a stack frame that might now be in use by some other context/thread. So cpuY continues execution and calls wake_up() on a ptr to a wait object that as been effectively deallocated. This race, when it hits, can cause a crash in wake_up(), which I've seen under heavy stress. It can also corrupt the referenced stack which can cause any number of failures. The fix: Allocate temporary c4iw_wr_wait objects and keep references on them so that it is never freed until both execution paths are done using it. Signed-off-by: Steve Wise <sw...@opengridcomputing.com> --- drivers/infiniband/hw/cxgb4/cm.c | 12 +++++++++++- drivers/infiniband/hw/cxgb4/cq.c | 33 ++++++++++++++++++++++---------- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 29 +++++++++++++++++++++++++++- drivers/infiniband/hw/cxgb4/mem.c | 10 ++++++---- drivers/infiniband/hw/cxgb4/qp.c | 16 +++++++++++----- 5 files changed, 79 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index d7ee70f..fcbc803 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -255,6 +255,13 @@ static void *alloc_ep(int size, gfp_t gfp) return epc; } +void _c4iw_free_tmp_wr_wait(struct kref *kref) +{ + struct c4iw_wr_wait *wr_waitp; + wr_waitp = container_of(kref, struct c4iw_wr_wait, kref); + kfree(wr_waitp); +} + void _c4iw_free_ep(struct kref *kref) { struct c4iw_ep *ep; @@ -2284,8 +2291,11 @@ static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb) ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff); wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1]; PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret); - if (wr_waitp) + if (wr_waitp) { c4iw_wake_up(wr_waitp, ret ? -ret : 0); + if (wr_waitp->tmp) + c4iw_put_tmp_wr_wait(wr_waitp); + } kfree_skb(skb); break; case 2: diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 8d8f8ad..4fbe82e 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -38,7 +38,7 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; - struct c4iw_wr_wait wr_wait; + struct c4iw_wr_wait *wr_waitp; struct sk_buff *skb; int ret; @@ -46,6 +46,13 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, skb = alloc_skb(wr_len, GFP_KERNEL); if (!skb) return -ENOMEM; + + wr_waitp = c4iw_get_tmp_wr_wait(); + if (!wr_waitp) { + kfree_skb(skb); + return -ENOMEM; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); @@ -55,16 +62,15 @@ static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, V_FW_RI_RES_WR_NRES(1) | FW_WR_COMPL(1)); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); - res_wr->cookie = (unsigned long) &wr_wait; + res_wr->cookie = (unsigned long) wr_waitp; res = res_wr->res; res->u.cq.restype = FW_RI_RES_TYPE_CQ; res->u.cq.op = FW_RI_RES_OP_RESET; res->u.cq.iqid = cpu_to_be32(cq->cqid); - c4iw_init_wr_wait(&wr_wait); ret = c4iw_ofld_send(rdev, skb); if (!ret) { - ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + ret = c4iw_wait_for_reply(rdev, wr_waitp, 0, 0, __func__); } kfree(cq->sw_queue); @@ -82,7 +88,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, struct fw_ri_res *res; int wr_len; int user = (uctx != &rdev->uctx); - struct c4iw_wr_wait wr_wait; + struct c4iw_wr_wait *wr_waitp; int ret; struct sk_buff *skb; @@ -116,6 +122,13 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, ret = -ENOMEM; goto err4; } + + wr_waitp = c4iw_get_tmp_wr_wait(); + if (!wr_waitp) { + ret = -ENOMEM; + goto err5; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); @@ -125,7 +138,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, V_FW_RI_RES_WR_NRES(1) | FW_WR_COMPL(1)); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); - res_wr->cookie = (unsigned long) &wr_wait; + res_wr->cookie = (unsigned long) wr_waitp; res = res_wr->res; res->u.cq.restype = FW_RI_RES_TYPE_CQ; res->u.cq.op = FW_RI_RES_OP_WRITE; @@ -144,13 +157,11 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, res->u.cq.iqsize = cpu_to_be16(cq->size); res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr); - c4iw_init_wr_wait(&wr_wait); - ret = c4iw_ofld_send(rdev, skb); if (ret) goto err4; - PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait); - ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + PDBG("%s wait_event wr_wait %p\n", __func__, wr_waitp); + ret = c4iw_wait_for_reply(rdev, wr_waitp, 0, 0, __func__); if (ret) goto err4; @@ -163,6 +174,8 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, cq->ugts &= PAGE_MASK; } return 0; +err5: + kfree_skb(skb); err4: dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue, dma_unmap_addr(cq, mapping)); diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 35d2a5d..63d9ca8 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -139,15 +139,39 @@ struct c4iw_wr_wait { wait_queue_head_t wait; unsigned long status; int ret; + struct kref kref; + int tmp; }; static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp) { + wr_waitp->tmp = 0; wr_waitp->ret = 0; wr_waitp->status = 0; init_waitqueue_head(&wr_waitp->wait); } +static inline struct c4iw_wr_wait *c4iw_get_tmp_wr_wait(void) +{ + struct c4iw_wr_wait *wr_waitp; + + wr_waitp = kmalloc(sizeof *wr_waitp, GFP_KERNEL); + if (wr_waitp) { + c4iw_init_wr_wait(wr_waitp); + kref_init(&wr_waitp->kref); + kref_get(&wr_waitp->kref); + wr_waitp->tmp = 1; + } + return wr_waitp; +} + +void _c4iw_free_tmp_wr_wait(struct kref *kref); + +static inline void c4iw_put_tmp_wr_wait(struct c4iw_wr_wait *wr_waitp) +{ + kref_put(&wr_waitp->kref, _c4iw_free_tmp_wr_wait); +} + static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret) { wr_waitp->ret = ret; @@ -180,7 +204,10 @@ static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev, if (wr_waitp->ret) PDBG("%s: FW reply %d tid %u qpid %u\n", pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid); - return wr_waitp->ret; + ret = wr_waitp->ret; + if (wr_waitp->tmp) + c4iw_put_tmp_wr_wait(wr_waitp); + return ret; } struct c4iw_dev { diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 273ffe4..685928d 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -46,12 +46,14 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, struct ulptx_idata *sc; u8 wr_len, *to_dp, *from_dp; int copy_len, num_wqe, i, ret = 0; - struct c4iw_wr_wait wr_wait; + struct c4iw_wr_wait *wr_waitp; addr &= 0x7FFFFFF; PDBG("%s addr 0x%x len %u\n", __func__, addr, len); num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE); - c4iw_init_wr_wait(&wr_wait); + wr_waitp = c4iw_get_tmp_wr_wait(); + if (!wr_waitp) + return -ENOMEM; for (i = 0; i < num_wqe; i++) { copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE : @@ -71,7 +73,7 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, if (i == (num_wqe-1)) { req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR) | FW_WR_COMPL(1)); - req->wr.wr_lo = (__force __be64)(unsigned long) &wr_wait; + req->wr.wr_lo = (__force __be64)(unsigned long) wr_waitp; } else req->wr.wr_hi = cpu_to_be32(FW_WR_OP(FW_ULPTX_WR)); req->wr.wr_mid = cpu_to_be32( @@ -103,7 +105,7 @@ static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len, len -= C4IW_MAX_INLINE_SIZE; } - ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__); + ret = c4iw_wait_for_reply(rdev, wr_waitp, 0, 0, __func__); return ret; } diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index a1824a5..d295a20 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -115,7 +115,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct fw_ri_res_wr *res_wr; struct fw_ri_res *res; int wr_len; - struct c4iw_wr_wait wr_wait; + struct c4iw_wr_wait *wr_waitp; struct sk_buff *skb; int ret; int eqsize; @@ -191,6 +191,12 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, ret = -ENOMEM; goto err7; } + wr_waitp = c4iw_get_tmp_wr_wait(); + if (!wr_waitp) { + ret = -ENOMEM; + goto err8; + } + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); @@ -200,7 +206,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, V_FW_RI_RES_WR_NRES(2) | FW_WR_COMPL(1)); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); - res_wr->cookie = (unsigned long) &wr_wait; + res_wr->cookie = (unsigned long) wr_waitp; res = res_wr->res; res->u.sqrq.restype = FW_RI_RES_TYPE_SQ; res->u.sqrq.op = FW_RI_RES_OP_WRITE; @@ -250,12 +256,10 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); - c4iw_init_wr_wait(&wr_wait); - ret = c4iw_ofld_send(rdev, skb); if (ret) goto err7; - ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__); + ret = c4iw_wait_for_reply(rdev, wr_waitp, 0, wq->sq.qid, __func__); if (ret) goto err7; @@ -264,6 +268,8 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, (unsigned long long)wq->sq.udb, (unsigned long long)wq->rq.udb); return 0; +err8: + kfree_skb(skb); err7: dma_free_coherent(&(rdev->lldi.pdev->dev), wq->rq.memsize, wq->rq.queue, -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html