Can you provide sample code to use these new features? --CQ
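
P.S. To make the question concrete: my reading of the patch below is that a kernel
consumer does not call anything new. The shrinking path is taken automatically when
the QP is created with all send WRs signalled (IB_SIGNAL_ALL_WR, i.e. selective
signalling off), the firmware is at least 2.2.232, the platform is 64-bit, and the
QP is not an SMI/GSI QP. Is the following untested sketch the intended usage? The
pd/cq arguments and the queue capacities are placeholders I made up.

#include <rdma/ib_verbs.h>

/*
 * My guess at minimal usage: no new API is involved; choosing
 * IB_SIGNAL_ALL_WR is what allows mlx4 to size each WQE individually.
 */
static struct ib_qp *create_shrinkable_sq_qp(struct ib_pd *pd,
                                             struct ib_cq *cq)
{
        struct ib_qp_init_attr init_attr = {
                .send_cq     = cq,
                .recv_cq     = cq,
                .sq_sig_type = IB_SIGNAL_ALL_WR, /* selective signalling off */
                .qp_type     = IB_QPT_RC,
                .cap = {
                        .max_send_wr  = 128,
                        .max_recv_wr  = 128,
                        .max_send_sge = 4,
                        .max_recv_sge = 4,
                },
        };

        /* ib_post_send()/ib_poll_cq() are then used exactly as before. */
        return ib_create_qp(pd, &init_attr);
}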
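
Also, to check my reading of the new sizing loop in set_kernel_sq_size(): a
maximum-sized WR now occupies DIV_ROUND_UP(s, 1 << wqe_shift) basic blocks, and
wqe_shift grows until the ring fits the device limits. Below is a small userspace
model of just that arithmetic; all the cap values are invented, not real ConnectX
limits.

#include <stdio.h>

static unsigned roundup_pow_of_two(unsigned x)
{
        unsigned r = 1;

        while (r < x)
                r <<= 1;
        return r;
}

int main(void)
{
        unsigned s = 208;               /* max WR size incl. overhead, bytes */
        unsigned max_send_wr = 128;     /* requested by the consumer */
        unsigned max_sq_desc_sz = 1008; /* made-up device caps */
        unsigned max_wqes = 16384;
        unsigned wqe_shift = 6;         /* 64-byte basic blocks on the fast path */
        unsigned wqes_per_wr, spare, cnt;

        for (;;) {
                if ((1u << wqe_shift) > max_sq_desc_sz)
                        return 1;
                /* basic blocks consumed by one maximum-sized WR */
                wqes_per_wr = (s + (1u << wqe_shift) - 1) >> wqe_shift;
                /* 2 KB of prefetch headroom plus one full WR */
                spare = (2048 >> wqe_shift) + wqes_per_wr;
                cnt = roundup_pow_of_two(max_send_wr * wqes_per_wr + spare);
                if (cnt <= max_wqes)
                        break;
                if (wqes_per_wr <= 1)
                        return 1;
                ++wqe_shift;
        }

        printf("wqe_shift=%u wqes_per_wr=%u wqe_cnt=%u max_post=%u\n",
               wqe_shift, wqes_per_wr, cnt, (cnt - spare) / wqes_per_wr);
        return 0;
}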
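
And if I follow the new posting path correctly, ind now advances by each WR's size
in basic blocks, with a NOP WQE filling the tail of the ring whenever a
maximum-sized WR could otherwise wrap in the middle. Here is a toy model of that
index arithmetic only (the ring size and WR sizes are made up); does this match the
intent?

#include <stdio.h>

enum { WQE_CNT = 64, MAX_WQES_PER_WR = 4 }; /* both powers of two */

static unsigned pad_wraparound(unsigned ind)
{
        unsigned left = WQE_CNT - (ind & (WQE_CNT - 1));

        if (left < MAX_WQES_PER_WR) {
                printf("  NOP of %u block(s) at slot %u\n",
                       left, ind & (WQE_CNT - 1));
                ind += left;    /* the NOP WQE consumes the ring tail */
        }
        return ind;
}

int main(void)
{
        unsigned ind = 0;
        unsigned i;

        for (i = 0; i < 26; ++i) {
                unsigned blocks = 1 + i % MAX_WQES_PER_WR; /* fake WR sizes */

                printf("WR %2u: slot %2u, %u block(s)\n",
                       i, ind & (WQE_CNT - 1), blocks);
                ind += blocks;  /* DIV_ROUND_UP(size * 16, 1 << wqe_shift) */
                ind = pad_wraparound(ind);
        }
        return 0;
}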
> -----Original Message-----
> From: [EMAIL PROTECTED]
> [mailto:[EMAIL PROTECTED] On Behalf Of
> Jack Morgenstein
> Sent: Wednesday, October 10, 2007 10:44 AM
> To: [email protected]
> Cc: Roland Dreier
> Subject: [ofa-general] [PATCH v5] IB/mlx4: shrinking WQE
>
> commit c0aa89f0b295dd0c20b2ff2b1d2eca10cdc84f4b
> Author: Michael S. Tsirkin <[EMAIL PROTECTED]>
> Date:   Thu Aug 30 15:51:40 2007 +0300
>
>     IB/mlx4: shrinking WQE
>
>     ConnectX supports shrinking WQEs, such that a single WR can include
>     multiple units of wqe_shift. This way, WRs can differ in size, and
>     do not have to be a power of 2 in size, saving memory and speeding up
>     send WR posting. Unfortunately, if we do this, the wqe_index field in
>     the CQE can't be used to look up the WR ID anymore, so do this only
>     if selective signalling is off.
>
>     Further, on 32-bit platforms, we can't use vmap to make
>     the QP buffer virtually contiguous. Thus we have to use
>     constant-sized WRs to make sure a WR is always fully within
>     a single page-sized chunk.
>
>     Finally, we use a WR with the NOP opcode to avoid wrap-around in
>     the middle of a WR. We set the NoErrorCompletion bit to avoid getting
>     completions with error for NOP WRs. Since NEC is only supported
>     starting with firmware 2.2.232, we use constant-sized WRs
>     for older firmware. And, since MLX QPs only support SEND, we use
>     constant-sized WRs in this case.
>
>     Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>
>
> ---
>
> Changes since v4: fix calls to stamp_send_wqe, and stamping placement
> inside post_nop_wqe.
> Found by regression, fixed by Jack Morgenstein.
> Changes since v3: fix NOP formatting.
> Found by Eli Cohen.
> Changes since v2: fix memory leak in mlx4_buf_alloc.
> Found by internal code review.
> Changes since v1: add missing patch hunks.
>
> Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
> ===================================================================
> --- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c	2007-10-10 17:12:05.184757000 +0200
> +++ infiniband/drivers/infiniband/hw/mlx4/cq.c	2007-10-10 17:23:02.337140000 +0200
> @@ -331,6 +331,12 @@ static int mlx4_ib_poll_one(struct mlx4_
>  	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
>  		MLX4_CQE_OPCODE_ERROR;
>
> +	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
> +		     is_send)) {
> +		printk(KERN_WARNING "Completion for NOP opcode detected!\n");
> +		return -EINVAL;
> +	}
> +
>  	if (!*cur_qp ||
>  	    (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
>  		/*
> @@ -353,8 +359,10 @@ static int mlx4_ib_poll_one(struct mlx4_
>
>  	if (is_send) {
>  		wq = &(*cur_qp)->sq;
> -		wqe_ctr = be16_to_cpu(cqe->wqe_index);
> -		wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
> +		if (!(*cur_qp)->sq_signal_bits) {
> +			wqe_ctr = be16_to_cpu(cqe->wqe_index);
> +			wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
> +		}
>  		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
>  		++wq->tail;
>  	} else if ((*cur_qp)->ibqp.srq) {
> Index: infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h
> ===================================================================
> --- infiniband.orig/drivers/infiniband/hw/mlx4/mlx4_ib.h	2007-10-10 17:21:17.844882000 +0200
> +++ infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h	2007-10-10 17:23:02.341138000 +0200
> @@ -120,6 +120,8 @@ struct mlx4_ib_qp {
>
>  	u32			doorbell_qpn;
>  	__be32			sq_signal_bits;
> +	unsigned		sq_next_wqe;
> +	int			sq_max_wqes_per_wr;
>  	int			sq_spare_wqes;
>  	struct mlx4_ib_wq	sq;
>
> Index: infiniband/drivers/infiniband/hw/mlx4/qp.c
> ===================================================================
> --- infiniband.orig/drivers/infiniband/hw/mlx4/qp.c	2007-10-10 17:21:17.853882000 +0200
> +++ infiniband/drivers/infiniband/hw/mlx4/qp.c	2007-10-10 17:23:02.350137000 +0200
> @@ -30,6 +30,7 @@
>   * SOFTWARE.
>   */
>
> +#include <linux/log2.h>
>  #include <rdma/ib_cache.h>
>  #include <rdma/ib_pack.h>
>
> @@ -92,7 +93,7 @@ static int is_qp0(struct mlx4_ib_dev *de
>
>  static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
>  {
> -	if (qp->buf.nbufs == 1)
> +	if (BITS_PER_LONG == 64 || qp->buf.nbufs == 1)
>  		return qp->buf.u.direct.buf + offset;
>  	else
>  		return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
> @@ -111,16 +112,88 @@ static void *get_send_wqe(struct mlx4_ib
>
>  /*
>   * Stamp a SQ WQE so that it is invalid if prefetched by marking the
> - * first four bytes of every 64 byte chunk with 0xffffffff, except for
> - * the very first chunk of the WQE.
> + * first four bytes of every 64 byte chunk with
> + * 0x7FFFFFF | (invalid_ownership_value << 31).
> + *
> + * When the max WR size is less than or equal to the WQE size,
> + * as an optimization, we can stamp the WQE with 0xffffffff,
> + * and skip the very first chunk of the WQE.
>   */
> -static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
> +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
>  {
> -	u32 *wqe = get_send_wqe(qp, n);
> +	u32 *wqe;
>  	int i;
> +	int s;
> +	int ind;
> +	void *buf;
> +	__be32 stamp;
> +
> +	s = roundup(size, 1 << qp->sq.wqe_shift);
> +	if (qp->sq_max_wqes_per_wr > 1) {
> +		for (i = 0; i < s; i += 64) {
> +			ind = (i >> qp->sq.wqe_shift) + n;
> +			stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
> +						       cpu_to_be32(0xffffffff);
> +			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
> +			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
> +			*wqe = stamp;
> +		}
> +	} else {
> +		buf = get_send_wqe(qp, n);
> +		for (i = 64; i < s; i += 64) {
> +			wqe = buf + i;
> +			*wqe = 0xffffffff;
> +		}
> +	}
> +}
> +
> +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
> +{
> +	struct mlx4_wqe_ctrl_seg *ctrl;
> +	struct mlx4_wqe_inline_seg *inl;
> +	void *wqe;
> +	int s;
> +
> +	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
> +	s = sizeof(struct mlx4_wqe_ctrl_seg);
> +
> +	if (qp->ibqp.qp_type == IB_QPT_UD) {
> +		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
> +		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
> +		memset(dgram, 0, sizeof *dgram);
> +		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
> +		s += sizeof(struct mlx4_wqe_datagram_seg);
> +	}
> +
> +	/* Pad the remainder of the WQE with an inline data segment. */
> +	if (size > s) {
> +		inl = wqe + s;
> +		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
> +	}
> +	ctrl->srcrb_flags = 0;
> +	ctrl->fence_size = size / 16;
> +	/*
> +	 * Make sure descriptor is fully written before
> +	 * setting ownership bit (because HW can start
> +	 * executing as soon as we do).
> +	 */
> +	wmb();
>
> -	for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
> -		wqe[i] = 0xffffffff;
> +	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
> +		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
> +
> +	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
> +}
> +
> +/* Post NOP WQE to prevent wrap-around in the middle of WR */
> +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
> +{
> +	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
> +	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
> +		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
> +		ind += s;
> +	}
> +	return ind;
>  }
>
>  static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
> @@ -237,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_de
>  static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
>  			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
>  {
> +	int s;
> +
>  	/* Sanity check SQ size before proceeding */
>  	if (cap->max_send_wr  > dev->dev->caps.max_wqes  ||
>  	    cap->max_send_sge > dev->dev->caps.max_sq_sg ||
> @@ -252,20 +327,69 @@ static int set_kernel_sq_size(struct mlx
>  	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
>  		return -EINVAL;
>
> -	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
> -							sizeof (struct mlx4_wqe_data_seg),
> -							cap->max_inline_data +
> -							sizeof (struct mlx4_wqe_inline_seg)) +
> -						    send_wqe_overhead(type)));
> -	qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
> -		sizeof (struct mlx4_wqe_data_seg);
> +	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
> +		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
> +		send_wqe_overhead(type);
>
>  	/*
> -	 * We need to leave 2 KB + 1 WQE of headroom in the SQ to
> -	 * allow HW to prefetch.
> +	 * Hermon supports shrinking WQEs, such that a single WR can include
> +	 * multiple units of wqe_shift. This way, WRs can differ in size, and
> +	 * do not have to be a power of 2 in size, saving memory and speeding up
> +	 * send WR posting. Unfortunately, if we do this, the wqe_index field
> +	 * in the CQE can't be used to look up the WR ID anymore, so do this
> +	 * only if selective signalling is off.
> +	 *
> +	 * Further, on 32-bit platforms, we can't use vmap to make
> +	 * the QP buffer virtually contiguous. Thus we have to use
> +	 * constant-sized WRs to make sure a WR is always fully within
> +	 * a single page-sized chunk.
> +	 *
> +	 * Finally, we use the NOP opcode to avoid wrap-around in the middle
> +	 * of a WR. We set the NEC bit to avoid getting completions with error
> +	 * for NOP WRs. Since NEC is only supported starting with firmware
> +	 * 2.2.232, we use constant-sized WRs for older firmware.
> +	 *
> +	 * And, since MLX QPs only support SEND, we use constant-sized WRs in
> +	 * this case.
> +	 *
> +	 * We look for the smallest value of wqe_shift such that the resulting
> +	 * number of WQEs does not exceed device capabilities.
> +	 *
> +	 * We set WQE size to at least 64 bytes, so that stamping invalidates
> +	 * each WQE.
>  	 */
> -	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
> -	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
> +	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
> +	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
> +	    type != IB_QPT_SMI && type != IB_QPT_GSI)
> +		qp->sq.wqe_shift = ilog2(64);
> +	else
> +		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
> +
> +	for (;;) {
> +		if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
> +			return -EINVAL;
> +
> +		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1 << qp->sq.wqe_shift);
> +
> +		/*
> +		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
> +		 * allow HW to prefetch.
> +		 */
> +		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
> +		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
> +						    qp->sq_max_wqes_per_wr +
> +						    qp->sq_spare_wqes);
> +
> +		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
> +			break;
> +
> +		if (qp->sq_max_wqes_per_wr <= 1)
> +			return -EINVAL;
> +
> +		++qp->sq.wqe_shift;
> +	}
> +
> +	qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
> +			 send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
>
>  	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
>  		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
> @@ -277,7 +401,8 @@ static int set_kernel_sq_size(struct mlx
>  		qp->sq.offset = 0;
>  	}
>
> -	cap->max_send_wr  = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
> +	cap->max_send_wr  = qp->sq.max_post =
> +		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
>  	cap->max_send_sge = qp->sq.max_gs;
>  	/* We don't support inline sends for kernel QPs (yet) */
>  	cap->max_inline_data = 0;
> @@ -315,6 +440,12 @@ static int create_qp_common(struct mlx4_
>  	qp->rq.tail = 0;
>  	qp->sq.head = 0;
>  	qp->sq.tail = 0;
> +	qp->sq_next_wqe = 0;
> +
> +	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
> +		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
> +	else
> +		qp->sq_signal_bits = 0;
>
>  	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
>  	if (err)
> @@ -405,11 +536,6 @@ static int create_qp_common(struct mlx4_
>  	 */
>  	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
>
> -	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
> -		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
> -	else
> -		qp->sq_signal_bits = 0;
> -
>  	qp->mqp.event = mlx4_ib_qp_event;
>
>  	return 0;
> @@ -904,7 +1030,7 @@ static int __mlx4_ib_modify_qp(struct ib
>  			ctrl = get_send_wqe(qp, i);
>  			ctrl->owner_opcode = cpu_to_be32(1 << 31);
>
> -			stamp_send_wqe(qp, i);
> +			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
>  		}
>  	}
>
> @@ -1266,13 +1392,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
>  	unsigned long flags;
>  	int nreq;
>  	int err = 0;
> -	int ind;
> -	int size;
> +	unsigned ind;
> +	int uninitialized_var(stamp);
> +	int uninitialized_var(size);
>  	int i;
>
>  	spin_lock_irqsave(&qp->rq.lock, flags);
>
> -	ind = qp->sq.head;
> +	ind = qp->sq_next_wqe;
>
>  	for (nreq = 0; wr; ++nreq, wr = wr->next) {
>  		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
> @@ -1288,7 +1415,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
>  		}
>
>  		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
> -		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
> +		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
>
>  		ctrl->srcrb_flags =
>  			(wr->send_flags & IB_SEND_SIGNALED ?
> @@ -1401,16 +1528,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
>  		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
>  			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
>
> +		stamp = ind + qp->sq_spare_wqes;
> +		ind += DIV_ROUND_UP(size * 16, 1 << qp->sq.wqe_shift);
> +
>  		/*
>  		 * We can improve latency by not stamping the last
>  		 * send queue WQE until after ringing the doorbell, so
>  		 * only stamp here if there are still more WQEs to post.
> +		 *
> +		 * Same optimization applies to padding with NOP WQE
> +		 * in case of WQE shrinking (used to prevent wrap-around
> +		 * in the middle of WR).
>  		 */
> -		if (wr->next)
> -			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
> -				       (qp->sq.wqe_cnt - 1));
> +		if (wr->next) {
> +			stamp_send_wqe(qp, stamp, size * 16);
> +			ind = pad_wraparound(qp, ind);
> +		}
>
> -		++ind;
>  	}
>
>  out:
> @@ -1432,8 +1566,10 @@ out:
>  		 */
>  		mmiowb();
>
> -		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
> -			       (qp->sq.wqe_cnt - 1));
> +		stamp_send_wqe(qp, stamp, size * 16);
> +
> +		ind = pad_wraparound(qp, ind);
> +		qp->sq_next_wqe = ind;
>  	}
>
>  	spin_unlock_irqrestore(&qp->rq.lock, flags);
> Index: infiniband/drivers/net/mlx4/alloc.c
> ===================================================================
> --- infiniband.orig/drivers/net/mlx4/alloc.c	2007-10-10 17:12:12.259502000 +0200
> +++ infiniband/drivers/net/mlx4/alloc.c	2007-10-10 17:23:02.356137000 +0200
> @@ -151,6 +151,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev,
>
>  			memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
>  		}
> +
> +		if (BITS_PER_LONG == 64) {
> +			struct page **pages;
> +			pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
> +			if (!pages)
> +				goto err_free;
> +			for (i = 0; i < buf->nbufs; ++i)
> +				pages[i] = virt_to_page(buf->u.page_list[i].buf);
> +			buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
> +			kfree(pages);
> +			if (!buf->u.direct.buf)
> +				goto err_free;
> +		}
>  	}
>
>  	return 0;
> @@ -170,6 +183,9 @@ void mlx4_buf_free(struct mlx4_dev *dev,
>  		dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
>  				  buf->u.direct.map);
>  	else {
> +		if (BITS_PER_LONG == 64)
> +			vunmap(buf->u.direct.buf);
> +
>  		for (i = 0; i < buf->nbufs; ++i)
>  			dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
>  					  buf->u.page_list[i].buf,
> Index: infiniband/include/linux/mlx4/device.h
> ===================================================================
> --- infiniband.orig/include/linux/mlx4/device.h	2007-10-10 17:21:17.954882000 +0200
> +++ infiniband/include/linux/mlx4/device.h	2007-10-10 17:23:02.363137000 +0200
> @@ -133,6 +133,11 @@ enum {
>  	MLX4_STAT_RATE_OFFSET	= 5
>  };
>
> +static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
> +{
> +	return (major << 32) | (minor << 16) | subminor;
> +}
> +
>  struct mlx4_caps {
>  	u64			fw_ver;
>  	int			num_ports;
> @@ -189,7 +194,7 @@ struct mlx4_buf_list {
>  };
>
>  struct mlx4_buf {
> -	union {
> +	struct {
>  		struct mlx4_buf_list	direct;
>  		struct mlx4_buf_list	*page_list;
>  	} u;
> Index: infiniband/include/linux/mlx4/qp.h
> ===================================================================
> --- infiniband.orig/include/linux/mlx4/qp.h	2007-10-10 17:12:38.460566000 +0200
> +++ infiniband/include/linux/mlx4/qp.h	2007-10-10 17:23:02.366140000 +0200
> @@ -154,7 +154,11 @@ struct mlx4_qp_context {
>  	u32			reserved5[10];
>  };
>
> +/* Which firmware version adds support for the NEC (NoErrorCompletion) bit */
> +#define MLX4_FW_VER_WQE_CTRL_NEC	mlx4_fw_ver(2, 2, 232)
> +
>  enum {
> +	MLX4_WQE_CTRL_NEC	= 1 << 29,
>  	MLX4_WQE_CTRL_FENCE	= 1 << 6,
>  	MLX4_WQE_CTRL_CQ_UPDATE	= 3 << 2,
>  	MLX4_WQE_CTRL_SOLICITED	= 1 << 1,
