There is a double CM disconnect during AE processing, which causes a crash. While fixing the crash, also simplify the AE handling code.
Signed-off-by: Faisal Latif <faisal.la...@intel.com> --- kernel_patches/fixes/nes_0025_ae_handling.patch | 164 +++++++++++++++++++++++ 1 files changed, 164 insertions(+), 0 deletions(-) create mode 100644 kernel_patches/fixes/nes_0025_ae_handling.patch diff --git a/kernel_patches/fixes/nes_0025_ae_handling.patch b/kernel_patches/fixes/nes_0025_ae_handling.patch new file mode 100644 index 0000000..0e541ca --- /dev/null +++ b/kernel_patches/fixes/nes_0025_ae_handling.patch @@ -0,0 +1,164 @@ +diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c +index b1c2cbb..310cc7c 100644 +--- a/drivers/infiniband/hw/nes/nes_hw.c ++++ b/drivers/infiniband/hw/nes/nes_hw.c +@@ -3352,8 +3352,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + u16 async_event_id; + u8 tcp_state; + u8 iwarp_state; +- int must_disconn = 1; +- int must_terminate = 0; + struct ib_event ibevent; + + nes_debug(NES_DBG_AEQ, "\n"); +@@ -3367,6 +3365,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + BUG_ON(!context); + } + ++ /* context is nesqp unless async_event_id == CQ ERROR */ ++ nesqp = (struct nes_qp *)(unsigned long)context; + async_event_id = (u16)aeq_info; + tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT; + iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT; +@@ -3378,8 +3378,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + + switch (async_event_id) { + case NES_AEQE_AEID_LLP_FIN_RECEIVED: +- nesqp = (struct nes_qp *)(unsigned long)context; +- + if (nesqp->term_flags) + return; /* Ignore it, wait for close complete */ + +@@ -3394,79 +3392,48 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + async_event_id, nesqp->last_aeq, tcp_state); + } + +- if ((tcp_state != NES_AEQE_TCP_STATE_CLOSE_WAIT) || +- (nesqp->ibqp_state != IB_QPS_RTS)) { +- /* FIN Received but tcp state or IB state moved on, +- should expect a close complete */ +- return; +- 
} +- ++ break; + case NES_AEQE_AEID_LLP_CLOSE_COMPLETE: +- nesqp = (struct nes_qp *)(unsigned long)context; + if (nesqp->term_flags) { + nes_terminate_done(nesqp, 0); + return; + } ++ spin_lock_irqsave(&nesqp->lock, flags); ++ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; ++ spin_unlock_irqrestore(&nesqp->lock, flags); ++ nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_CLOSING, 0, 0); ++ nes_cm_disconn(nesqp); ++ break; + +- case NES_AEQE_AEID_LLP_CONNECTION_RESET: + case NES_AEQE_AEID_RESET_SENT: +- nesqp = (struct nes_qp *)(unsigned long)context; +- if (async_event_id == NES_AEQE_AEID_RESET_SENT) { +- tcp_state = NES_AEQE_TCP_STATE_CLOSED; +- } ++ tcp_state = NES_AEQE_TCP_STATE_CLOSED; + spin_lock_irqsave(&nesqp->lock, flags); + nesqp->hw_iwarp_state = iwarp_state; + nesqp->hw_tcp_state = tcp_state; + nesqp->last_aeq = async_event_id; +- +- if ((tcp_state == NES_AEQE_TCP_STATE_CLOSED) || +- (tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT)) { +- nesqp->hte_added = 0; +- next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; +- } +- +- if ((nesqp->ibqp_state == IB_QPS_RTS) && +- ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) || +- (async_event_id == NES_AEQE_AEID_LLP_CONNECTION_RESET))) { +- switch (nesqp->hw_iwarp_state) { +- case NES_AEQE_IWARP_STATE_RTS: +- next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING; +- nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING; +- break; +- case NES_AEQE_IWARP_STATE_TERMINATE: +- must_disconn = 0; /* terminate path takes care of disconn */ +- if (nesqp->term_flags == 0) +- must_terminate = 1; +- break; +- } +- } else { +- if (async_event_id == NES_AEQE_AEID_LLP_FIN_RECEIVED) { +- /* FIN Received but ib state not RTS, +- close complete will be on its way */ +- must_disconn = 0; +- } +- } ++ nesqp->hte_added = 0; + spin_unlock_irqrestore(&nesqp->lock, flags); ++ next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE; ++ nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); ++ 
nes_cm_disconn(nesqp); ++ break; + +- if (must_terminate) +- nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); +- else if (must_disconn) { +- if (next_iwarp_state) { +- nes_debug(NES_DBG_AEQ, "issuing hw modifyqp for QP%u. next state = 0x%08X\n", +- nesqp->hwqp.qp_id, next_iwarp_state); +- nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0); +- } +- nes_cm_disconn(nesqp); +- } ++ case NES_AEQE_AEID_LLP_CONNECTION_RESET: ++ if (atomic_read(&nesqp->close_timer_started)) ++ return; ++ spin_lock_irqsave(&nesqp->lock, flags); ++ nesqp->hw_iwarp_state = iwarp_state; ++ nesqp->hw_tcp_state = tcp_state; ++ nesqp->last_aeq = async_event_id; ++ spin_unlock_irqrestore(&nesqp->lock, flags); ++ nes_cm_disconn(nesqp); + break; + + case NES_AEQE_AEID_TERMINATE_SENT: +- nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_send_fin(nesdev, nesqp, aeqe); + break; + + case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED: +- nesqp = (struct nes_qp *)(unsigned long)context; + nes_terminate_received(nesdev, nesqp, aeqe); + break; + +@@ -3480,7 +3447,8 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER: + case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION: + case NES_AEQE_AEID_AMP_TO_WRAP: +- nesqp = (struct nes_qp *)(unsigned long)context; ++ printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n", ++ nesqp->hwqp.qp_id, async_event_id); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR); + break; + +@@ -3488,7 +3456,6 @@ static void nes_process_iwarp_aeqe(struct nes_device *nesdev, + case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL: + case NES_AEQE_AEID_DDP_UBE_INVALID_MO: + case NES_AEQE_AEID_DDP_UBE_INVALID_QN: +- nesqp = (struct nes_qp *)(unsigned long)context; + if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) { + aeq_info &= 0xffff0000; + aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE; +@@ -3530,7 +3497,8 @@ static void 
nes_process_iwarp_aeqe(struct nes_device *nesdev, + case NES_AEQE_AEID_STAG_ZERO_INVALID: + case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST: + case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP: +- nesqp = (struct nes_qp *)(unsigned long)context; ++ printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n", ++ nesqp->hwqp.qp_id, async_event_id); + nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL); + break; + +-- +1.6.0 + -- 1.6.0 _______________________________________________ ewg mailing list ewg@lists.openfabrics.org http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ewg