With the following in place, we are able to perform very large RMPP
transfers. Please comment.
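
For reference, below is a minimal sketch (not part of this patch) of how a
kernel consumer would build a large RMPP send with the new API. The agent,
ah, qpn, class header length (hdr_len) and the payload source are
assumptions for illustration only:

        struct ib_mad_send_buf *msg;
        struct ib_mad_multipacket_seg *seg;
        int left, seg_payload, max_seg_payload;

        /* length is the total MAD length (headers + data) and may be
         * much larger than the 256-byte MAD size */
        msg = ib_create_send_mad(agent, qpn, 0, 1 /* rmpp_active */,
                                 hdr_len, length - hdr_len, GFP_KERNEL);
        if (IS_ERR(msg))
                return PTR_ERR(msg);
        msg->ah = ah;

        /* The first sizeof(struct ib_mad) bytes live in msg->mad itself;
         * every further segment is a separately allocated chunk. */
        /* ... fill msg->mad with the first sizeof(struct ib_mad) bytes ... */
        left = length - sizeof(struct ib_mad);
        max_seg_payload = sizeof(struct ib_mad) - hdr_len;
        while (left > 0) {
                seg_payload = min(left, max_seg_payload);
                seg = kzalloc(sizeof *seg + max_seg_payload, GFP_KERNEL);
                if (!seg) {
                        ib_free_send_mad(msg); /* frees appended segs too */
                        return -ENOMEM;
                }
                /* ... copy the next seg_payload bytes into seg->data ... */
                ib_append_to_multipacket_mad(msg, seg);
                left -= seg_payload;
        }

        return ib_post_send_mad(msg, NULL);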

---

Modify the RMPP MAD support to accept a linked list of segments
instead of a single large, physically contiguous buffer.
The list is kept in the mad_send_wr private data and is constructed
with the new ib_append_to_multipacket_mad() API call.
Modify user_mad.c to allocate large MADs for send/receive in chunks.

Signed-off-by: Jack Morgenstein <[EMAIL PROTECTED]>
Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>
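
From user space the entire message is now passed to the umad device in a
single contiguous buffer, and the kernel does the chunking internally. A
minimal sketch of the send side (the fd and header setup are assumptions,
not part of this patch):

        size_t len = sizeof(struct ib_user_mad) + mad_len; /* may be >> 256 */
        struct ib_user_mad *umad = calloc(1, len);

        if (!umad)
                return -ENOMEM;
        /* fill in umad->hdr (qpn, lid, ...), then the MAD and RMPP
         * headers and the payload in umad->data */
        if (write(fd, umad, len) != (ssize_t) len)
                perror("umad write");
        free(umad);

On receive, if the buffer handed to read() is smaller than the full message,
only the first segment (which carries the total RMPP message length) is
copied to the user, -ENOSPC is returned and the packet is requeued, so the
reader can retry with a buffer of the right size.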

Index: latest/drivers/infiniband/core/mad_rmpp.c
===================================================================
--- latest.orig/drivers/infiniband/core/mad_rmpp.c
+++ latest/drivers/infiniband/core/mad_rmpp.c
@@ -433,44 +433,6 @@ static struct ib_mad_recv_wc * complete_
        return rmpp_wc;
 }
 
-void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf)
-{
-       struct ib_mad_recv_buf *seg_buf;
-       struct ib_rmpp_mad *rmpp_mad;
-       void *data;
-       int size, len, offset;
-       u8 flags;
-
-       len = mad_recv_wc->mad_len;
-       if (len <= sizeof(struct ib_mad)) {
-               memcpy(buf, mad_recv_wc->recv_buf.mad, len);
-               return;
-       }
-
-       offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
-
-       list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) {
-               rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad;
-               flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr);
-
-               if (flags & IB_MGMT_RMPP_FLAG_FIRST) {
-                       data = rmpp_mad;
-                       size = sizeof(*rmpp_mad);
-               } else {
-                       data = (void *) rmpp_mad + offset;
-                       if (flags & IB_MGMT_RMPP_FLAG_LAST)
-                               size = len;
-                       else
-                               size = sizeof(*rmpp_mad) - offset;
-               }
-
-               memcpy(buf, data, size);
-               len -= size;
-               buf += size;
-       }
-}
-EXPORT_SYMBOL(ib_coalesce_recv_mad);
-
 static struct ib_mad_recv_wc *
 continue_rmpp(struct ib_mad_agent_private *agent,
              struct ib_mad_recv_wc *mad_recv_wc)
@@ -570,16 +532,26 @@ start_rmpp(struct ib_mad_agent_private *
        return mad_recv_wc;
 }
 
-static inline u64 get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
+static inline void *get_seg_addr(struct ib_mad_send_wr_private *mad_send_wr)
 {
-       return mad_send_wr->sg_list[0].addr + mad_send_wr->data_offset +
-              (sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset) *
-              (mad_send_wr->seg_num - 1);
+       struct ib_mad_multipacket_seg *seg;
+       int i = 2;
+
+       if (list_empty(&mad_send_wr->multipacket_list))
+               return NULL;
+
+       list_for_each_entry(seg, &mad_send_wr->multipacket_list, list) {
+               if (i == mad_send_wr->seg_num)
+                       return seg->data;
+               i++;
+       }
+       return NULL;
 }
 
-static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
 {
        struct ib_rmpp_mad *rmpp_mad;
+       void *next_data;
        int timeout;
        u32 paylen;
 
@@ -594,12 +566,14 @@ static int send_next_seg(struct ib_mad_s
                rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
                mad_send_wr->sg_list[0].length = sizeof(struct ib_rmpp_mad);
        } else {
-               mad_send_wr->send_wr.num_sge = 2;
-               mad_send_wr->sg_list[0].length = mad_send_wr->data_offset;
-               mad_send_wr->sg_list[1].addr = get_seg_addr(mad_send_wr);
-               mad_send_wr->sg_list[1].length = sizeof(struct ib_rmpp_mad) -
-                                                mad_send_wr->data_offset;
-               mad_send_wr->sg_list[1].lkey = mad_send_wr->sg_list[0].lkey;
+               next_data = get_seg_addr(mad_send_wr);
+               if (!next_data) {
+                       printk(KERN_ERR PFX "send_next_seg: "
+                              "could not find next segment\n");
+                       return -EINVAL;
+               }
+               memcpy((void *)rmpp_mad + mad_send_wr->data_offset, next_data,
+                      sizeof(struct ib_rmpp_mad) - mad_send_wr->data_offset);
                rmpp_mad->rmpp_hdr.paylen_newwin = 0;
        }
 
Index: latest/drivers/infiniband/include/rdma/ib_mad.h
===================================================================
--- latest.orig/drivers/infiniband/include/rdma/ib_mad.h
+++ latest/drivers/infiniband/include/rdma/ib_mad.h
@@ -141,6 +141,11 @@ struct ib_rmpp_hdr {
        __be32  paylen_newwin;
 };
 
+struct ib_mad_multipacket_seg {
+       struct list_head list;
+       u8 data[0];
+};
+
 typedef u64 __bitwise ib_sa_comp_mask;
 
 #define IB_SA_COMP_MASK(n) ((__force ib_sa_comp_mask) cpu_to_be64(1ull << n))
@@ -485,17 +490,6 @@ int ib_unregister_mad_agent(struct ib_ma
 int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
                     struct ib_mad_send_buf **bad_send_buf);
 
-/**
- * ib_coalesce_recv_mad - Coalesces received MAD data into a single buffer.
- * @mad_recv_wc: Work completion information for a received MAD.
- * @buf: User-provided data buffer to receive the coalesced buffers.  The
- *   referenced buffer should be at least the size of the mad_len specified
- *   by @mad_recv_wc.
- *
- * This call copies a chain of received MAD segments into a single data buffer,
- * removing duplicated headers.
- */
-void ib_coalesce_recv_mad(struct ib_mad_recv_wc *mad_recv_wc, void *buf);
 
 /**
  * ib_free_recv_mad - Returns data buffers used to receive a MAD.
@@ -601,6 +595,18 @@ struct ib_mad_send_buf * ib_create_send_
                                            gfp_t gfp_mask);
 
 /**
+ * ib_append_to_multipacket_mad - Append a segment of an RMPP multipacket
+ *   MAD send to the send buffer.
+ * @send_buf: Previously allocated send data buffer.
+ * @seg: segment to append to linked list (already filled with data).
+ *
+ * This routine appends a segment of a multipacket RMPP message
+ * (copied from user space) to a MAD for sending.
+ */
+void ib_append_to_multipacket_mad(struct ib_mad_send_buf *send_buf,
+                                  struct ib_mad_multipacket_seg *seg);
+
+/**
  * ib_free_send_mad - Returns data buffers used to send a MAD.
  * @send_buf: Previously allocated send data buffer.
  */
Index: latest/drivers/infiniband/core/mad.c
===================================================================
--- latest.orig/drivers/infiniband/core/mad.c
+++ latest/drivers/infiniband/core/mad.c
@@ -792,17 +792,13 @@ struct ib_mad_send_buf * ib_create_send_
                return ERR_PTR(-EINVAL);
 
        length = sizeof *mad_send_wr + buf_size;
-       if (length >= PAGE_SIZE)
-               buf = (void *)__get_free_pages(gfp_mask, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-       else
-               buf = kmalloc(length, gfp_mask);
+       buf = kzalloc(sizeof *mad_send_wr + sizeof(struct ib_mad), gfp_mask);
 
        if (!buf)
                return ERR_PTR(-ENOMEM);
 
-       memset(buf, 0, length);
-
-       mad_send_wr = buf + buf_size;
+       mad_send_wr = buf + sizeof(struct ib_mad);
+       INIT_LIST_HEAD(&mad_send_wr->multipacket_list);
        mad_send_wr->send_buf.mad = buf;
 
        mad_send_wr->mad_agent_priv = mad_agent_priv;
@@ -834,23 +830,33 @@ struct ib_mad_send_buf * ib_create_send_
 }
 EXPORT_SYMBOL(ib_create_send_mad);
 
+void ib_append_to_multipacket_mad(struct ib_mad_send_buf *send_buf,
+                                  struct ib_mad_multipacket_seg *seg)
+{
+       struct ib_mad_send_wr_private *mad_send_wr;
+
+       mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+                                  send_buf);
+       list_add_tail(&seg->list, &mad_send_wr->multipacket_list);
+}
+EXPORT_SYMBOL(ib_append_to_multipacket_mad);
+
 void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
 {
        struct ib_mad_agent_private *mad_agent_priv;
-       void *mad_send_wr;
-       int length;
+       struct ib_mad_send_wr_private *mad_send_wr;
+       struct ib_mad_multipacket_seg *seg, *tmp;
 
        mad_agent_priv = container_of(send_buf->mad_agent,
                                      struct ib_mad_agent_private, agent);
        mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
                                   send_buf);
 
-       length = sizeof(struct ib_mad_send_wr_private) + (mad_send_wr - send_buf->mad);
-       if (length >= PAGE_SIZE)
-               free_pages((unsigned long)send_buf->mad, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-       else
-               kfree(send_buf->mad);
-
+       list_for_each_entry_safe(seg, tmp, &mad_send_wr->multipacket_list, list) {
+               list_del(&seg->list);
+               kfree(seg);
+       }
+       kfree(send_buf->mad);
        if (atomic_dec_and_test(&mad_agent_priv->refcount))
                wake_up(&mad_agent_priv->wait);
 }
Index: latest/drivers/infiniband/core/mad_priv.h
===================================================================
--- latest.orig/drivers/infiniband/core/mad_priv.h
+++ latest/drivers/infiniband/core/mad_priv.h
@@ -130,6 +130,7 @@ struct ib_mad_send_wr_private {
        enum ib_wc_status status;
 
        /* RMPP control */
+       struct list_head multipacket_list;
        int last_ack;
        int seg_num;
        int newwin;
Index: latest/drivers/infiniband/core/user_mad.c
===================================================================
--- latest.orig/drivers/infiniband/core/user_mad.c
+++ latest/drivers/infiniband/core/user_mad.c
@@ -123,6 +123,7 @@ struct ib_umad_packet {
        struct ib_mad_send_buf *msg;
        struct list_head   list;
        int                length;
+       struct list_head   seg_list;
        struct ib_user_mad mad;
 };
 
@@ -176,6 +177,87 @@ static int queue_packet(struct ib_umad_f
        return ret;
 }
 
+static int data_offset(u8 mgmt_class)
+{
+       if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+               return IB_MGMT_SA_HDR;
+       else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+                (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+               return IB_MGMT_VENDOR_HDR;
+       else
+               return IB_MGMT_RMPP_HDR;
+}
+
+static int copy_recv_mad(struct ib_mad_recv_wc *mad_recv_wc,
+                         struct ib_umad_packet *packet)
+{
+       struct ib_mad_recv_buf *seg_buf;
+       struct ib_rmpp_mad *rmpp_mad;
+       void *data;
+       struct ib_mad_multipacket_seg *seg;
+       int size, len, offset;
+       u8 flags;
+
+       len = mad_recv_wc->mad_len;
+       if (len <= sizeof(struct ib_mad)) {
+               memcpy(&packet->mad.data, mad_recv_wc->recv_buf.mad, len);
+               return 0;
+       }
+
+       offset = data_offset(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+
+       list_for_each_entry(seg_buf, &mad_recv_wc->rmpp_list, list) {
+               rmpp_mad = (struct ib_rmpp_mad *)seg_buf->mad;
+               flags = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr);
+
+               if (flags & IB_MGMT_RMPP_FLAG_FIRST) {
+                       size = sizeof(*rmpp_mad);
+                       memcpy(&packet->mad.data, rmpp_mad, size);
+               } else {
+                       data = (void *) rmpp_mad + offset;
+                       if (flags & IB_MGMT_RMPP_FLAG_LAST)
+                               size = len;
+                       else
+                               size = sizeof(*rmpp_mad) - offset;
+                       seg = kmalloc(sizeof(struct ib_mad_multipacket_seg) +
+                                     sizeof(struct ib_rmpp_mad) - offset,
+                                     GFP_KERNEL);
+                       if (!seg)
+                               return -ENOMEM;
+                       memcpy(seg->data, data, size);
+                       list_add_tail(&seg->list, &packet->seg_list);
+               }
+               len -= size;
+       }
+       return 0;
+}
+
+static struct ib_umad_packet *alloc_packet(void)
+{
+       struct ib_umad_packet *packet;
+       int length = sizeof *packet + sizeof(struct ib_mad);
+
+       packet = kzalloc(length, GFP_KERNEL);
+       if (!packet) {
+               printk(KERN_ERR "alloc_packet: mem alloc failed for length %d\n",
+                      length);
+               return NULL;
+       }
+       INIT_LIST_HEAD(&packet->seg_list);
+       return packet;
+}
+
+static void free_packet(struct ib_umad_packet *packet)
+{
+       struct ib_mad_multipacket_seg *seg, *tmp;
+
+       list_for_each_entry_safe(seg, tmp, &packet->seg_list, list) {
+               list_del(&seg->list);
+               kfree(seg);
+       }
+       kfree(packet);
+}
+
 static void send_handler(struct ib_mad_agent *agent,
                         struct ib_mad_send_wc *send_wc)
 {
@@ -187,7 +269,7 @@ static void send_handler(struct ib_mad_a
        ib_free_send_mad(packet->msg);
 
        if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
-               timeout = kzalloc(sizeof *timeout + IB_MGMT_MAD_HDR, GFP_KERNEL);
+               timeout = alloc_packet();
                if (!timeout)
                        goto out;
 
@@ -198,40 +280,14 @@ static void send_handler(struct ib_mad_a
                       sizeof (struct ib_mad_hdr));
 
                if (!queue_packet(file, agent, timeout))
-                               return;
+                       return;
+               else
+                       free_packet(timeout);
        }
 out:
        kfree(packet);
 }
 
-static struct ib_umad_packet *alloc_packet(int buf_size)
-{
-       struct ib_umad_packet *packet;
-       int length = sizeof *packet + buf_size;
-
-       if (length >= PAGE_SIZE)
-               packet = (void *)__get_free_pages(GFP_KERNEL, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-       else
-               packet = kmalloc(length, GFP_KERNEL);
-
-       if (!packet)
-               return NULL;
-
-       memset(packet, 0, length);
-       return packet;
-}
-
-static void free_packet(struct ib_umad_packet *packet)
-{
-       int length = packet->length + sizeof *packet;
-       if (length >= PAGE_SIZE)
-               free_pages((unsigned long) packet, long_log2(roundup_pow_of_two(length)) - PAGE_SHIFT);
-       else
-               kfree(packet);
-}
-
-
-
 static void recv_handler(struct ib_mad_agent *agent,
                         struct ib_mad_recv_wc *mad_recv_wc)
 {
@@ -243,13 +299,16 @@ static void recv_handler(struct ib_mad_a
                goto out;
 
        length = mad_recv_wc->mad_len;
-       packet = alloc_packet(length);
+       packet = alloc_packet();
        if (!packet)
                goto out;
 
        packet->length = length;
 
-       ib_coalesce_recv_mad(mad_recv_wc, packet->mad.data);
+       if (copy_recv_mad(mad_recv_wc, packet)) {
+               free_packet(packet);
+               goto out;
+       }
 
        packet->mad.hdr.status    = 0;
        packet->mad.hdr.length    = length + sizeof (struct ib_user_mad);
@@ -278,6 +337,7 @@ static ssize_t ib_umad_read(struct file 
                            size_t count, loff_t *pos)
 {
        struct ib_umad_file *file = filp->private_data;
+       struct ib_mad_multipacket_seg *seg;
        struct ib_umad_packet *packet;
        ssize_t ret;
 
@@ -304,18 +364,42 @@ static ssize_t ib_umad_read(struct file 
 
        spin_unlock_irq(&file->recv_lock);
 
-       if (count < packet->length + sizeof (struct ib_user_mad)) {
-               /* Return length needed (and first RMPP segment) if too small */
-               if (copy_to_user(buf, &packet->mad,
-                                sizeof (struct ib_user_mad) + sizeof (struct ib_mad)))
-                       ret = -EFAULT;
-               else
-                       ret = -ENOSPC;
-       } else if (copy_to_user(buf, &packet->mad,
-                               packet->length + sizeof (struct ib_user_mad)))
+       if (copy_to_user(buf, &packet->mad,
+                        sizeof(struct ib_user_mad) + sizeof(struct ib_mad))) {
                ret = -EFAULT;
-       else
+               goto err;
+       }
+
+       if (count < packet->length + sizeof (struct ib_user_mad))
+               /* User buffer too small. Return first RMPP segment (which
+                * includes RMPP message length).
+                */
+               ret = -ENOSPC;
+       else if (packet->length <= sizeof(struct ib_mad))
+               ret = packet->length + sizeof(struct ib_user_mad);
+       else {
+               int len = packet->length - sizeof(struct ib_mad);
+               struct ib_rmpp_mad *rmpp_mad =
+                               (struct ib_rmpp_mad *) packet->mad.data;
+               int max_seg_payload = sizeof(struct ib_mad) -
+                                     data_offset(rmpp_mad->mad_hdr.mgmt_class);
+               int seg_payload;
+               /* multipacket RMPP MAD message. Copy remainder of message.
+                * Note that last segment may have a shorter payload.
+                */
+               buf += sizeof(struct ib_user_mad) + sizeof(struct ib_mad);
+               list_for_each_entry(seg, &packet->seg_list, list) {
+                       seg_payload = min_t(int, len, max_seg_payload);
+                       if (copy_to_user(buf, seg->data, seg_payload)) {
+                               ret = -EFAULT;
+                               goto err;
+                       }
+                       buf += seg_payload;
+                       len -= seg_payload;
+               }
                ret = packet->length + sizeof (struct ib_user_mad);
+       }
+err:
        if (ret < 0) {
                /* Requeue packet */
                spin_lock_irq(&file->recv_lock);
@@ -339,6 +423,8 @@ static ssize_t ib_umad_write(struct file
        __be64 *tid;
        int ret, length, hdr_len, copy_offset;
        int rmpp_active, has_rmpp_header;
+       int max_seg_payload;
+       struct ib_mad_multipacket_seg *seg;
 
        if (count < sizeof (struct ib_user_mad) + IB_MGMT_RMPP_HDR)
                return -EINVAL;
@@ -415,6 +501,11 @@ static ssize_t ib_umad_write(struct file
                goto err_ah;
        }
 
+       if (!rmpp_active && length > sizeof(struct ib_mad)) {
+               ret = -EINVAL;
+               goto err_ah;
+       }
+
        packet->msg = ib_create_send_mad(agent,
                                         be32_to_cpu(packet->mad.hdr.qpn),
                                         0, rmpp_active,
@@ -432,12 +523,39 @@ static ssize_t ib_umad_write(struct file
 
        /* Copy MAD headers (RMPP header in place) */
        memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
-       /* Now, copy rest of message from user into send buffer */
+       /* complete copying first 256 bytes of message into send buffer */
        if (copy_from_user(packet->msg->mad + copy_offset,
                           buf + sizeof (struct ib_user_mad) + copy_offset,
-                          length - copy_offset)) {
+                          min_t(int, length, sizeof(struct ib_mad)) - copy_offset)) {
                ret = -EFAULT;
                goto err_msg;
+       }
+
+       /* If multipacket, copy the remainder of the send message from
+        * user space into the multipacket list. */
+       length -= sizeof(struct ib_mad);
+       buf +=  sizeof (struct ib_user_mad) + sizeof(struct ib_mad);
+       max_seg_payload = sizeof(struct ib_mad) -
+                         data_offset(rmpp_mad->mad_hdr.mgmt_class);
+       while (length > 0) {
+               int seg_payload = min_t(int, length, max_seg_payload);
+               seg = kzalloc(sizeof(struct ib_mad_multipacket_seg) +
+                             max_seg_payload, GFP_KERNEL);
+               if (!seg) {
+                       printk(KERN_ERR "ib_umad_write: "
+                              "mem alloc failed for length %zu\n",
+                              sizeof(struct ib_mad_multipacket_seg) +
+                              max_seg_payload);
+                       ret = -ENOMEM;
+                       goto err_msg;
+               }
+
+               if (copy_from_user(seg->data, buf, seg_payload)) {
+                       kfree(seg);
+                       ret = -EFAULT;
+                       goto err_msg;
+               }
+               ib_append_to_multipacket_mad(packet->msg, seg);
+               buf += seg_payload;
+               length -= seg_payload;
        }
 
        /*

-- 
MST