Here are the kernel driver changes that go with the user library changes just posted.
Index: src/linux-kernel/infiniband/hw/ipath/ipath_qp.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_qp.c (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_qp.c (working copy) @@ -425,11 +425,12 @@ * @ibqp: the queue pair who's attributes we're modifying * @attr: the new attributes * @attr_mask: the mask of attributes to modify + * @udata: not used by the InfiniPath verbs driver * * Returns 0 on success, otherwise returns an errno. */ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask) + int attr_mask, struct ib_udata *udata) { struct ipath_ibdev *dev = to_idev(ibqp->device); struct ipath_qp *qp = to_iqp(ibqp); Index: src/linux-kernel/infiniband/hw/ipath/ipath_ruc.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_ruc.c (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_ruc.c (working copy) @@ -105,6 +105,54 @@ spin_unlock_irqrestore(&dev->pending_lock, flags); } +static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + int user = to_ipd(qp->ibqp.pd)->user; + int i, j, ret; + struct ib_wc wc; + + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if ((user && wqe->sg_list[i].lkey == 0) || + !ipath_lkey_ok(&dev->lk_table, + &qp->r_sg_list[j], &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + qp->r_sge.sge = qp->r_sg_list[0]; + qp->r_sge.sg_list = qp->r_sg_list + 1; + qp->r_sge.num_sge = j; + ret = 1; + goto bail; + +bad_lkey: + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.imm_data = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = 0; + wc.wc_flags = 0; + wc.pkey_index = 0; + wc.slid = 0; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal solicited completion event. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + /** * ipath_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP @@ -118,73 +166,69 @@ { unsigned long flags; struct ipath_rq *rq; + struct ipath_rwq *wq; struct ipath_srq *srq; struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; int ret; - if (!qp->ibqp.srq) { + if (qp->ibqp.srq) { + srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; rq = &qp->r_rq; - spin_lock_irqsave(&rq->lock, flags); + } - if (unlikely(rq->tail == rq->head)) { + spin_lock_irqsave(&rq->lock, flags); + wq = rq->wq; + tail = wq->tail; + do { + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); ret = 0; goto bail; } - wqe = get_rwqe_ptr(rq, rq->tail); - qp->r_wr_id = wqe->wr_id; - if (!wr_id_only) { - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->num_sge; - qp->r_len = wqe->length; - } - if (++rq->tail >= rq->size) - rq->tail = 0; - goto done; - } + wqe = get_rwqe_ptr(rq, tail); + if (++tail >= rq->size) + tail = 0; + } while (!wr_id_only && !init_sge(qp, wqe)); + qp->r_wr_id = wqe->wr_id; + wq->tail = tail; - srq = to_isrq(qp->ibqp.srq); - rq = &srq->rq; - spin_lock_irqsave(&rq->lock, flags); - - if (unlikely(rq->tail == rq->head)) { - ret = 0; - goto bail; - } - wqe = get_rwqe_ptr(rq, rq->tail); - qp->r_wr_id = wqe->wr_id; - if (!wr_id_only) { - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->num_sge; - qp->r_len = wqe->length; - } - if (++rq->tail >= rq->size) - rq->tail = 0; - if (srq->ibsrq.event_handler) { - struct ib_event ev; + ret = 1; + if (handler) { u32 n; - if (rq->head < rq->tail) - n = rq->size + rq->head - rq->tail; + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; else - n = rq->head - rq->tail; + n -= tail; if (n < srq->limit) { + struct ib_event ev; + srq->limit = 0; spin_unlock_irqrestore(&rq->lock, flags); ev.device = qp->ibqp.device; ev.element.srq = qp->ibqp.srq; ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - srq->ibsrq.event_handler(&ev, - srq->ibsrq.srq_context); - spin_lock_irqsave(&rq->lock, flags); + handler(&ev, srq->ibsrq.srq_context); + goto bail; } } -done: - ret = 1; + spin_unlock_irqrestore(&rq->lock, flags); bail: - spin_unlock_irqrestore(&rq->lock, flags); return ret; } Index: src/linux-kernel/infiniband/hw/ipath/Makefile =================================================================== --- src/linux-kernel/infiniband/hw/ipath/Makefile (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/Makefile (working copy) @@ -25,6 +25,7 @@ ipath_cq.o \ ipath_keys.o \ ipath_mad.o \ + ipath_mmap.o \ ipath_mr.o \ ipath_qp.o \ ipath_rc.o \ Index: src/linux-kernel/infiniband/hw/ipath/ipath_verbs.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_verbs.c (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_verbs.c (working copy) @@ -280,11 +280,12 @@ struct ib_recv_wr **bad_wr) { struct ipath_qp *qp = to_iqp(ibqp); + struct ipath_rwq *wq = qp->r_rq.wq; unsigned long flags; int ret; /* Check that state is OK to post receive. */ - if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK)) { + if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) { *bad_wr = wr; ret = -EINVAL; goto bail; @@ -293,59 +294,31 @@ for (; wr; wr = wr->next) { struct ipath_rwqe *wqe; u32 next; - int i, j; + int i; - if (wr->num_sge > qp->r_rq.max_sge) { + if ((unsigned) wr->num_sge > qp->r_rq.max_sge) { *bad_wr = wr; ret = -ENOMEM; goto bail; } spin_lock_irqsave(&qp->r_rq.lock, flags); - next = qp->r_rq.head + 1; + next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; - if (next == qp->r_rq.tail) { + if (next == wq->tail) { spin_unlock_irqrestore(&qp->r_rq.lock, flags); *bad_wr = wr; ret = -ENOMEM; goto bail; } - wqe = get_rwqe_ptr(&qp->r_rq, qp->r_rq.head); + wqe = get_rwqe_ptr(&qp->r_rq, wq->head); wqe->wr_id = wr->wr_id; - wqe->sg_list[0].mr = NULL; - wqe->sg_list[0].vaddr = NULL; - wqe->sg_list[0].length = 0; - wqe->sg_list[0].sge_length = 0; - wqe->length = 0; - for (i = 0, j = 0; i < wr->num_sge; i++) { - /* Check LKEY */ - if (to_ipd(qp->ibqp.pd)->user && - wr->sg_list[i].lkey == 0) { - spin_unlock_irqrestore(&qp->r_rq.lock, - flags); - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - if (wr->sg_list[i].length == 0) - continue; - if (!ipath_lkey_ok( - &to_idev(qp->ibqp.device)->lk_table, - &wqe->sg_list[j], &wr->sg_list[i], - IB_ACCESS_LOCAL_WRITE)) { - spin_unlock_irqrestore(&qp->r_rq.lock, - flags); - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - wqe->length += wr->sg_list[i].length; - j++; - } - wqe->num_sge = j; - qp->r_rq.head = next; + wqe->num_sge = wr->num_sge; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[i] = wr->sg_list[i]; + wq->head = next; spin_unlock_irqrestore(&qp->r_rq.lock, flags); } ret = 0; @@ -694,7 +667,7 @@ ipath_layer_get_lastibcstat(dev->dd) & 0xf]; props->port_cap_flags = dev->port_cap_flags; props->gid_tbl_len = 1; - props->max_msg_sz = 4096; + props->max_msg_sz = 0x80000000; props->pkey_tbl_len = ipath_layer_get_npkeys(dev->dd); props->bad_pkey_cntr = ipath_layer_get_cr_errpkey(dev->dd) - dev->z_pkey_violations; @@ -871,7 +844,7 @@ goto bail; } - if (ah_attr->port_num != 1 || + if (ah_attr->port_num < 1 || ah_attr->port_num > pd->device->phys_port_cnt) { ret = ERR_PTR(-EINVAL); goto bail; @@ -883,6 +856,8 @@ goto bail; } + dev->n_ahs_allocated++; + /* ib_create_ah() will initialize ah->ibah. */ ah->attr = *ah_attr; @@ -1137,6 +1112,7 @@ dev->attach_mcast = ipath_multicast_attach; dev->detach_mcast = ipath_multicast_detach; dev->process_mad = ipath_process_mad; + dev->mmap = ipath_mmap; snprintf(dev->node_desc, sizeof(dev->node_desc), IPATH_IDSTR " %s kernel_SMA", system_utsname.nodename); Index: src/linux-kernel/infiniband/hw/ipath/ipath_verbs.h =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_verbs.h (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_verbs.h (working copy) @@ -577,7 +577,7 @@ int ipath_destroy_qp(struct ib_qp *ibqp); int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask); + int attr_mask, struct ib_udata *udata); int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr); @@ -636,7 +636,8 @@ struct ib_udata *udata); int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, - enum ib_srq_attr_mask attr_mask); + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata); int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); Index: src/linux-kernel/infiniband/hw/ipath/ipath_mmap.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_mmap.c (revision 0) +++ src/linux-kernel/infiniband/hw/ipath/ipath_mmap.c (revision 0) @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2006 PathScale, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/config.h> +#include <linux/module.h> + +#include <linux/mm.h> +#include <linux/errno.h> +#include <asm/pgtable.h> + +#include "ipath_verbs.h" + +/** + * ipath_release_mmap_info - free mmap info structure + * @ref: a pointer to the kref within struct ipath_mmap_info + */ +void ipath_release_mmap_info(struct kref *ref) +{ + struct ipath_mmap_info *ip = + container_of(ref, struct ipath_mmap_info, ref); + + vfree(ip->obj); + kfree(ip); +} + +/* + * open and close keep track of how many times the CQ is mapped, + * to avoid releasing it. + */ +static void ipath_vma_open(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + kref_get(&ip->ref); + ip->mmap_cnt++; +} + +static void ipath_vma_close(struct vm_area_struct *vma) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + + ip->mmap_cnt--; + kref_put(&ip->ref, ipath_release_mmap_info); +} + +/* + * ipath_vma_nopage - handle a VMA page fault. + */ +static struct page *ipath_vma_nopage(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + struct ipath_mmap_info *ip = vma->vm_private_data; + unsigned long offset = address - vma->vm_start; + struct page *page = NOPAGE_SIGBUS; + void *pageptr; + + if (offset >= ip->size) + goto out; /* out of range */ + + /* + * Convert the vmalloc address into a struct page. + */ + pageptr = (void *)(offset + (vma->vm_pgoff << PAGE_SHIFT)); + page = vmalloc_to_page(pageptr); + + /* Increment the reference count. */ + get_page(page); + if (type) + *type = VM_FAULT_MINOR; +out: + return page; +} + +static struct vm_operations_struct ipath_vm_ops = { + .open = ipath_vma_open, + .close = ipath_vma_close, + .nopage = ipath_vma_nopage, +}; + +/** + * ipath_mmap - create a new mmap region + * @context: the IB user context of the process making the mmap() call + * @vma: the VMA to be initialized + * Return zero if the mmap is OK. Otherwise, return an errno. + */ +int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) +{ + struct ipath_ibdev *dev = to_idev(context->device); + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long size = vma->vm_end - vma->vm_start; + struct ipath_mmap_info *ip, **pp; + + /* + * Search the device's list of objects waiting for a mmap call. + * Normally, this list is very short since a call to create a + * CQ, QP, or SRQ is soon followed by a call to mmap(). + */ + spin_lock_irq(&dev->pending_lock); + for (pp = &dev->pending_mmaps; (ip = *pp); pp = &ip->next) { + /* Only the creator is allowed to mmap the object */ + if (context != ip->context || (void *) offset != ip->obj) + continue; + /* Don't allow a mmap larger than the object. */ + if (size > ip->size) + break; + + *pp = ip->next; + spin_unlock_irq(&dev->pending_lock); + + vma->vm_ops = &ipath_vm_ops; + vma->vm_flags |= VM_RESERVED; + vma->vm_private_data = ip; + ipath_vma_open(vma); + return 0; + } + spin_unlock_irq(&dev->pending_lock); + return -EINVAL; +} Index: src/linux-kernel/infiniband/hw/ipath/ipath_cq.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_cq.c (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_cq.c (working copy) @@ -41,20 +41,28 @@ * @entry: work completion entry to add * @sig: true if @entry is a solicitated entry * - * This may be called with one of the qp->s_lock or qp->r_rq.lock held. + * This may be called with qp->s_lock held. */ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited) { + struct ipath_cq_wc *wc = cq->queue; unsigned long flags; + u32 head; u32 next; spin_lock_irqsave(&cq->lock, flags); - if (cq->head == cq->ibcq.cqe) + /* + * Note that the head pointer might be writable by user processes. + * Take care to verify it is a sane value. + */ + head = wc->head; + if (head >= (unsigned) cq->ibcq.cqe) { + head = cq->ibcq.cqe; next = 0; - else - next = cq->head + 1; - if (unlikely(next == cq->tail)) { + } else + next = head + 1; + if (unlikely(next == wc->tail)) { spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -66,8 +74,8 @@ } return; } - cq->queue[cq->head] = *entry; - cq->head = next; + wc->queue[head] = *entry; + wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && solicited)) { @@ -100,19 +108,20 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { struct ipath_cq *cq = to_icq(ibcq); + struct ipath_cq_wc *wc = cq->queue; unsigned long flags; int npolled; spin_lock_irqsave(&cq->lock, flags); for (npolled = 0; npolled < num_entries; ++npolled, ++entry) { - if (cq->tail == cq->head) + if (wc->tail == wc->head) break; - *entry = cq->queue[cq->tail]; - if (cq->tail == cq->ibcq.cqe) - cq->tail = 0; + *entry = wc->queue[wc->tail]; + if (wc->tail >= cq->ibcq.cqe) + wc->tail = 0; else - cq->tail++; + wc->tail++; } spin_unlock_irqrestore(&cq->lock, flags); @@ -159,7 +168,7 @@ { struct ipath_ibdev *dev = to_idev(ibdev); struct ipath_cq *cq; - struct ib_wc *wc; + struct ipath_cq_wc *wc; struct ib_cq *ret; if (entries > ib_ipath_max_cqes) { @@ -172,10 +181,7 @@ goto bail; } - /* - * Need to use vmalloc() if we want to support large #s of - * entries. - */ + /* Allocate the completion queue structure. */ cq = kmalloc(sizeof(*cq), GFP_KERNEL); if (!cq) { ret = ERR_PTR(-ENOMEM); @@ -183,15 +189,54 @@ } /* - * Need to use vmalloc() if we want to support large #s of entries. + * Allocate the completion queue entries and head/tail pointers. + * This is allocated separately so that it can be resized and + * also mapped into user space. + * We need to use vmalloc() in order to support mmap and large + * numbers of entries. */ - wc = vmalloc(sizeof(*wc) * (entries + 1)); + wc = vmalloc(sizeof(*wc) + sizeof(struct ib_wc) * entries); if (!wc) { - kfree(cq); ret = ERR_PTR(-ENOMEM); - goto bail; + goto free_cq; } + /* + * Return the address of the WC as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata) { + struct ipath_mmap_info *ip; + __u64 offset = (__u64) wc; + int err; + + err = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto free_wc; + } + + /* Allocate info for ipath_mmap(). */ + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) { + ret = ERR_PTR(-ENOMEM); + goto free_wc; + } + cq->ip = ip; + ip->context = context; + ip->obj = wc; + kref_init(&ip->ref); + ip->mmap_cnt = 0; + ip->size = PAGE_ALIGN(sizeof(*wc) + + sizeof(struct ib_wc) * entries); + spin_lock_irq(&dev->pending_lock); + ip->next = dev->pending_mmaps; + dev->pending_mmaps = ip; + spin_unlock_irq(&dev->pending_lock); + } else + cq->ip = NULL; + + /* * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe. * The number of entries should be >= the number requested or return * an error. @@ -201,14 +246,18 @@ cq->triggered = 0; spin_lock_init(&cq->lock); tasklet_init(&cq->comptask, send_complete, (unsigned long)cq); - cq->head = 0; - cq->tail = 0; + wc->head = 0; + wc->tail = 0; cq->queue = wc; ret = &cq->ibcq; - dev->n_cqs_allocated++; + goto bail; +free_wc: + vfree(wc); +free_cq: + kfree(cq); bail: return ret; } @@ -228,7 +277,10 @@ tasklet_kill(&cq->comptask); dev->n_cqs_allocated--; - vfree(cq->queue); + if (cq->ip) + kref_put(&cq->ip->ref, ipath_release_mmap_info); + else + vfree(cq->queue); kfree(cq); return 0; @@ -252,7 +304,7 @@ spin_lock_irqsave(&cq->lock, flags); /* * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow - * any other transitions. + * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2). */ if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = notify; @@ -263,46 +315,87 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct ipath_cq *cq = to_icq(ibcq); - struct ib_wc *wc, *old_wc; - u32 n; + struct ipath_cq_wc *old_wc = cq->queue; + struct ipath_cq_wc *wc; + u32 head, tail, n; int ret; + /* Don't allow resize if completion queue is mmapped. */ + if (cq->ip && cq->ip->mmap_cnt) { + ret = -EBUSY; + goto bail; + } + /* * Need to use vmalloc() if we want to support large #s of entries. */ - wc = vmalloc(sizeof(*wc) * (cqe + 1)); + wc = vmalloc(sizeof(*wc) + sizeof(struct ib_wc) * cqe); if (!wc) { ret = -ENOMEM; goto bail; } + /* + * Return the address of the WC as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata) { + __u64 offset = (__u64) wc; + + ret = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (ret) + goto bail; + } + spin_lock_irq(&cq->lock); - if (cq->head < cq->tail) - n = cq->ibcq.cqe + 1 + cq->head - cq->tail; + /* + * Make sure head and tail are sane since they + * might be user writable. + */ + head = old_wc->head; + if (head > (u32) cq->ibcq.cqe) + head = (u32) cq->ibcq.cqe; + tail = old_wc->tail; + if (tail > (u32) cq->ibcq.cqe) + tail = (u32) cq->ibcq.cqe; + if (head < tail) + n = cq->ibcq.cqe + 1 + head - tail; else - n = cq->head - cq->tail; + n = head - tail; if (unlikely((u32)cqe < n)) { spin_unlock_irq(&cq->lock); vfree(wc); ret = -EOVERFLOW; goto bail; } - for (n = 0; cq->tail != cq->head; n++) { - wc[n] = cq->queue[cq->tail]; - if (cq->tail == cq->ibcq.cqe) - cq->tail = 0; + for (n = 0; tail != head; n++) { + wc->queue[n] = old_wc->queue[tail]; + if (tail == (u32) cq->ibcq.cqe) + tail = 0; else - cq->tail++; + tail++; } cq->ibcq.cqe = cqe; - cq->head = n; - cq->tail = 0; - old_wc = cq->queue; + wc->head = n; + wc->tail = 0; cq->queue = wc; spin_unlock_irq(&cq->lock); vfree(old_wc); + if (cq->ip) { + struct ipath_ibdev *dev = to_idev(ibcq->device); + struct ipath_mmap_info *ip = cq->ip; + + ip->obj = wc; + ip->size = PAGE_ALIGN(sizeof(*wc) + + sizeof(struct ib_wc) * cqe); + spin_lock_irq(&dev->pending_lock); + ip->next = dev->pending_mmaps; + dev->pending_mmaps = ip; + spin_unlock_irq(&dev->pending_lock); + } + ret = 0; bail: Index: src/linux-kernel/infiniband/hw/ipath/ipath_srq.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_srq.c (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_srq.c (working copy) @@ -47,66 +47,38 @@ struct ib_recv_wr **bad_wr) { struct ipath_srq *srq = to_isrq(ibsrq); - struct ipath_ibdev *dev = to_idev(ibsrq->device); + struct ipath_rwq *wq; unsigned long flags; int ret; for (; wr; wr = wr->next) { struct ipath_rwqe *wqe; u32 next; - int i, j; + int i; - if (wr->num_sge > srq->rq.max_sge) { + if ((unsigned) wr->num_sge > srq->rq.max_sge) { *bad_wr = wr; ret = -ENOMEM; goto bail; } spin_lock_irqsave(&srq->rq.lock, flags); - next = srq->rq.head + 1; + wq = srq->rq.wq; + next = wq->head + 1; if (next >= srq->rq.size) next = 0; - if (next == srq->rq.tail) { + if (next == wq->tail) { spin_unlock_irqrestore(&srq->rq.lock, flags); *bad_wr = wr; ret = -ENOMEM; goto bail; } - wqe = get_rwqe_ptr(&srq->rq, srq->rq.head); + wqe = get_rwqe_ptr(&srq->rq, wq->head); wqe->wr_id = wr->wr_id; - wqe->sg_list[0].mr = NULL; - wqe->sg_list[0].vaddr = NULL; - wqe->sg_list[0].length = 0; - wqe->sg_list[0].sge_length = 0; - wqe->length = 0; - for (i = 0, j = 0; i < wr->num_sge; i++) { - /* Check LKEY */ - if (to_ipd(srq->ibsrq.pd)->user && - wr->sg_list[i].lkey == 0) { - spin_unlock_irqrestore(&srq->rq.lock, - flags); - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - if (wr->sg_list[i].length == 0) - continue; - if (!ipath_lkey_ok(&dev->lk_table, - &wqe->sg_list[j], - &wr->sg_list[i], - IB_ACCESS_LOCAL_WRITE)) { - spin_unlock_irqrestore(&srq->rq.lock, - flags); - *bad_wr = wr; - ret = -EINVAL; - goto bail; - } - wqe->length += wr->sg_list[i].length; - j++; - } - wqe->num_sge = j; - srq->rq.head = next; + for (i = 0; i < wr->num_sge; i++) + wqe->sg_list[0] = wr->sg_list[i]; + wq->head = next; spin_unlock_irqrestore(&srq->rq.lock, flags); } ret = 0; @@ -156,28 +128,67 @@ * Need to use vmalloc() if we want to support large #s of entries. */ srq->rq.size = srq_init_attr->attr.max_wr + 1; - sz = sizeof(struct ipath_sge) * srq_init_attr->attr.max_sge + + srq->rq.max_sge = srq_init_attr->attr.max_sge; + sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct ipath_rwqe); - srq->rq.wq = vmalloc(srq->rq.size * sz); + srq->rq.wq = vmalloc(sizeof(struct ipath_rwq) + srq->rq.size * sz); if (!srq->rq.wq) { - kfree(srq); ret = ERR_PTR(-ENOMEM); - goto bail; + goto free_srq; } /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata) { + struct ipath_mmap_info *ip; + __u64 offset = (__u64) srq->rq.wq; + int err; + + err = ib_copy_to_udata(udata, &offset, sizeof(offset)); + if (err) { + ret = ERR_PTR(err); + goto free_rwq; + } + + /* Allocate info for ipath_mmap(). */ + ip = kmalloc(sizeof(*ip), GFP_KERNEL); + if (!ip) { + ret = ERR_PTR(-ENOMEM); + goto free_rwq; + } + srq->ip = ip; + ip->context = ibpd->uobject->context; + ip->obj = srq->rq.wq; + kref_init(&ip->ref); + ip->mmap_cnt = 0; + ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) + + srq->rq.size * sz); + spin_lock_irq(&dev->pending_lock); + ip->next = dev->pending_mmaps; + dev->pending_mmaps = ip; + spin_unlock_irq(&dev->pending_lock); + } else + srq->ip = NULL; + + /* * ib_create_srq() will initialize srq->ibsrq. */ spin_lock_init(&srq->rq.lock); - srq->rq.head = 0; - srq->rq.tail = 0; - srq->rq.max_sge = srq_init_attr->attr.max_sge; + srq->rq.wq->head = 0; + srq->rq.wq->tail = 0; srq->limit = srq_init_attr->attr.srq_limit; + dev->n_srqs_allocated++; + ret = &srq->ibsrq; + goto bail; - dev->n_srqs_allocated++; - +free_rwq: + vfree(srq->rq.wq); +free_srq: + kfree(srq); bail: return ret; } @@ -187,83 +198,143 @@ * @ibsrq: the SRQ to modify * @attr: the new attributes of the SRQ * @attr_mask: indicates which attributes to modify + * @udata: user data for ipathverbs.so */ int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, - enum ib_srq_attr_mask attr_mask) + enum ib_srq_attr_mask attr_mask, + struct ib_udata *udata) { struct ipath_srq *srq = to_isrq(ibsrq); - unsigned long flags; - int ret; + int ret = 0; - if (attr_mask & IB_SRQ_MAX_WR) - if ((attr->max_wr > ib_ipath_max_srq_wrs) || - (attr->max_sge > srq->rq.max_sge)) { + if (attr_mask & IB_SRQ_MAX_WR) { + struct ipath_rwq *owq; + struct ipath_rwq *wq; + struct ipath_rwqe *p; + u32 sz, size, n, head, tail; + + /* Don't allow resize if mmapped */ + if (srq->ip && srq->ip->mmap_cnt) { ret = -EINVAL; goto bail; } - if (attr_mask & IB_SRQ_LIMIT) - if (attr->srq_limit >= srq->rq.size) { + /* + * Check that the requested sizes are below the limits + * and that user/kernel SRQs are only resized by the + * user/kernel. + */ + if ((attr->max_wr > ib_ipath_max_srq_wrs) || + (!udata != !srq->ip) || + ((attr_mask & IB_SRQ_LIMIT) && + attr->srq_limit > attr->max_wr) || + (!(attr_mask & IB_SRQ_LIMIT) && + srq->limit > attr->max_wr)) { ret = -EINVAL; goto bail; } - if (attr_mask & IB_SRQ_MAX_WR) { - struct ipath_rwqe *wq, *p; - u32 sz, size, n; - sz = sizeof(struct ipath_rwqe) + - attr->max_sge * sizeof(struct ipath_sge); + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = vmalloc(size * sz); + wq = vmalloc(sizeof(struct ipath_rwq) + size * sz); if (!wq) { ret = -ENOMEM; goto bail; } - spin_lock_irqsave(&srq->rq.lock, flags); - if (srq->rq.head < srq->rq.tail) - n = srq->rq.size + srq->rq.head - srq->rq.tail; + /* + * Return the address of the RWQ as the offset to mmap. + * See ipath_mmap() for details. + */ + if (udata) { + __u64 offset_addr; + __u64 offset = (__u64) wq; + + ret = ib_copy_from_udata(&offset_addr, udata, + sizeof(offset_addr)); + if (ret) { + vfree(wq); + goto bail; + } + udata->outbuf = (void __user *) offset_addr; + ret = ib_copy_to_udata(udata, &offset, + sizeof(offset)); + if (ret) { + vfree(wq); + goto bail; + } + } + + spin_lock_irq(&srq->rq.lock); + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + owq = srq->rq.wq; + head = owq->head; + if (head >= srq->rq.size) + head = 0; + tail = owq->tail; + if (tail >= srq->rq.size) + tail = 0; + n = head; + if (n < tail) + n += srq->rq.size - tail; else - n = srq->rq.head - srq->rq.tail; - if (size <= n || size <= srq->limit) { - spin_unlock_irqrestore(&srq->rq.lock, flags); + n -= tail; + if (size <= n) { + spin_unlock_irq(&srq->rq.lock); vfree(wq); ret = -EINVAL; goto bail; } n = 0; - p = wq; - while (srq->rq.tail != srq->rq.head) { + p = wq->wq; + while (tail != head) { struct ipath_rwqe *wqe; int i; - wqe = get_rwqe_ptr(&srq->rq, srq->rq.tail); + wqe = get_rwqe_ptr(&srq->rq, tail); p->wr_id = wqe->wr_id; - p->length = wqe->length; p->num_sge = wqe->num_sge; for (i = 0; i < wqe->num_sge; i++) p->sg_list[i] = wqe->sg_list[i]; n++; p = (struct ipath_rwqe *)((char *) p + sz); - if (++srq->rq.tail >= srq->rq.size) - srq->rq.tail = 0; + if (++tail >= srq->rq.size) + tail = 0; } - vfree(srq->rq.wq); srq->rq.wq = wq; srq->rq.size = size; - srq->rq.head = n; - srq->rq.tail = 0; - srq->rq.max_sge = attr->max_sge; - spin_unlock_irqrestore(&srq->rq.lock, flags); - } + wq->head = n; + wq->tail = 0; + if (attr_mask & IB_SRQ_LIMIT) + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); - if (attr_mask & IB_SRQ_LIMIT) { - spin_lock_irqsave(&srq->rq.lock, flags); - srq->limit = attr->srq_limit; - spin_unlock_irqrestore(&srq->rq.lock, flags); + vfree(owq); + + if (srq->ip) { + struct ipath_mmap_info *ip = srq->ip; + struct ipath_ibdev *dev = to_idev(srq->ibsrq.device); + + ip->obj = wq; + ip->size = PAGE_ALIGN(sizeof(struct ipath_rwq) + + size * sz); + spin_lock_irq(&dev->pending_lock); + ip->next = dev->pending_mmaps; + dev->pending_mmaps = ip; + spin_unlock_irq(&dev->pending_lock); + } + } else if (attr_mask & IB_SRQ_LIMIT) { + spin_lock_irq(&srq->rq.lock); + if (attr->srq_limit >= srq->rq.size) + ret = -EINVAL; + else + srq->limit = attr->srq_limit; + spin_unlock_irq(&srq->rq.lock); } - ret = 0; bail: return ret; @@ -289,7 +360,10 @@ struct ipath_ibdev *dev = to_idev(ibsrq->device); dev->n_srqs_allocated--; - vfree(srq->rq.wq); + if (srq->ip) + kref_put(&srq->ip->ref, ipath_release_mmap_info); + else + vfree(srq->rq.wq); kfree(srq); return 0; Index: src/linux-kernel/infiniband/hw/ipath/ipath_ud.c =================================================================== --- src/linux-kernel/infiniband/hw/ipath/ipath_ud.c (revision 8021) +++ src/linux-kernel/infiniband/hw/ipath/ipath_ud.c (working copy) @@ -35,6 +35,53 @@ #include "ipath_verbs.h" #include "ips_common.h" +static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe, + u32 *lengthp, struct ipath_sge_state *ss) +{ + struct ipath_ibdev *dev = to_idev(qp->ibqp.device); + int user = to_ipd(qp->ibqp.pd)->user; + int i, j, ret; + struct ib_wc wc; + + *lengthp = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if ((user && wqe->sg_list[i].lkey == 0) || + !ipath_lkey_ok(&dev->lk_table, + j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + *lengthp += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ret = 1; + goto bail; + +bad_lkey: + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.byte_len = 0; + wc.imm_data = 0; + wc.qp_num = qp->ibqp.qp_num; + wc.src_qp = 0; + wc.wc_flags = 0; + wc.pkey_index = 0; + wc.slid = 0; + wc.sl = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal solicited completion event. */ + ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + /** * ipath_ud_loopback - handle send on loopback QPs * @sqp: the QP @@ -45,6 +92,8 @@ * * This is called from ipath_post_ud_send() to forward a WQE addressed * to the same HCA. + * Note that the receive interrupt handler may be calling ipath_ud_rcv() + * while this is being called. */ static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_sge_state *ss, @@ -59,7 +108,11 @@ struct ipath_srq *srq; struct ipath_sge_state rsge; struct ipath_sge *sge; + struct ipath_rwq *wq; struct ipath_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + u32 rlen; qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn); if (!qp) @@ -93,6 +146,13 @@ wc->imm_data = 0; } + if (wr->num_sge > 1) { + rsge.sg_list = kmalloc((wr->num_sge - 1) * + sizeof(struct ipath_sge), + GFP_ATOMIC); + } else + rsge.sg_list = NULL; + /* * Get the next work request entry to find where to put the data. * Note that it is safe to drop the lock after changing rq->tail @@ -100,37 +160,52 @@ */ if (qp->ibqp.srq) { srq = to_isrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; rq = &srq->rq; } else { srq = NULL; + handler = NULL; rq = &qp->r_rq; } + spin_lock_irqsave(&rq->lock, flags); - if (rq->tail == rq->head) { - spin_unlock_irqrestore(&rq->lock, flags); - dev->n_pkt_drops++; - goto done; + wq = rq->wq; + tail = wq->tail; + while (1) { + if (unlikely(tail == wq->head)) { + spin_unlock_irqrestore(&rq->lock, flags); + dev->n_pkt_drops++; + goto free_sge; + } + wqe = get_rwqe_ptr(rq, tail); + if (++tail >= rq->size) + tail = 0; + if (init_sge(qp, wqe, &rlen, &rsge)) + break; + wq->tail = tail; } /* Silently drop packets which are too big. */ - wqe = get_rwqe_ptr(rq, rq->tail); - if (wc->byte_len > wqe->length) { + if (wc->byte_len > rlen) { spin_unlock_irqrestore(&rq->lock, flags); dev->n_pkt_drops++; - goto done; + goto free_sge; } + wq->tail = tail; wc->wr_id = wqe->wr_id; - rsge.sge = wqe->sg_list[0]; - rsge.sg_list = wqe->sg_list + 1; - rsge.num_sge = wqe->num_sge; - if (++rq->tail >= rq->size) - rq->tail = 0; - if (srq && srq->ibsrq.event_handler) { + if (handler) { u32 n; - if (rq->head < rq->tail) - n = rq->size + rq->head - rq->tail; + /* + * validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; else - n = rq->head - rq->tail; + n -= tail; if (n < srq->limit) { struct ib_event ev; @@ -139,12 +214,12 @@ ev.device = qp->ibqp.device; ev.element.srq = qp->ibqp.srq; ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - srq->ibsrq.event_handler(&ev, - srq->ibsrq.srq_context); + handler(&ev, srq->ibsrq.srq_context); } else spin_unlock_irqrestore(&rq->lock, flags); } else spin_unlock_irqrestore(&rq->lock, flags); + ah_attr = &to_iah(wr->wr.ud.ah)->attr; if (ah_attr->ah_flags & IB_AH_GRH) { ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh)); @@ -195,6 +270,8 @@ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc, wr->send_flags & IB_SEND_SOLICITED); +free_sge: + kfree(rsge.sg_list); done: if (atomic_dec_and_test(&qp->refcount)) wake_up(&qp->wait); @@ -432,13 +509,9 @@ int opcode; u32 hdrsize; u32 pad; - unsigned long flags; struct ib_wc wc; u32 qkey; u32 src_qp; - struct ipath_rq *rq; - struct ipath_srq *srq; - struct ipath_rwqe *wqe; u16 dlid; int header_in_data; @@ -546,19 +619,10 @@ /* * Get the next work request entry to find where to put the data. - * Note that it is safe to drop the lock after changing rq->tail - * since ipath_post_receive() won't fill the empty slot. */ - if (qp->ibqp.srq) { - srq = to_isrq(qp->ibqp.srq); - rq = &srq->rq; - } else { - srq = NULL; - rq = &qp->r_rq; - } - spin_lock_irqsave(&rq->lock, flags); - if (rq->tail == rq->head) { - spin_unlock_irqrestore(&rq->lock, flags); + if (qp->r_reuse_sge) + qp->r_reuse_sge = 0; + else if (!ipath_get_rwqe(qp, 0)) { /* * Count VL15 packets dropped due to no receive buffer. * Otherwise, count them as buffer overruns since usually, @@ -572,39 +636,11 @@ goto bail; } /* Silently drop packets which are too big. */ - wqe = get_rwqe_ptr(rq, rq->tail); - if (wc.byte_len > wqe->length) { - spin_unlock_irqrestore(&rq->lock, flags); + if (wc.byte_len > qp->r_len) { + qp->r_reuse_sge = 1; dev->n_pkt_drops++; goto bail; } - wc.wr_id = wqe->wr_id; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->num_sge; - if (++rq->tail >= rq->size) - rq->tail = 0; - if (srq && srq->ibsrq.event_handler) { - u32 n; - - if (rq->head < rq->tail) - n = rq->size + rq->head - rq->tail; - else - n = rq->head - rq->tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - srq->ibsrq.event_handler(&ev, - srq->ibsrq.srq_context); - } else - spin_unlock_irqrestore(&rq->lock, flags); - } else - spin_unlock_irqrestore(&rq->lock, flags); if (has_grh) { ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh, sizeof(struct ib_grh)); @@ -613,6 +649,7 @@ ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh)); ipath_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh)); + wc.wr_id = qp->r_wr_id; wc.status = IB_WC_SUCCESS; wc.opcode = IB_WC_RECV; wc.vendor_err = 0; -- Ralph Campbell <[EMAIL PROTECTED]> _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general