Hi Shirley,

I have created the following patch based on your patch series such that
it touches less of the previous code and is thus smaller. Please review
it and see if you think we can make any use of it.


Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h       2008-02-06 18:57:34.847708000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h    2008-02-06 19:22:02.372113000 +0200
@@ -56,11 +56,11 @@
 /* constants */
 
 enum {
-       IPOIB_PACKET_SIZE         = 2048,
-       IPOIB_BUF_SIZE            = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
        IPOIB_ENCAP_LEN           = 4,
 
+       IPOIB_UD_HEAD_SIZE        = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+       IPOIB_UD_RX_SG            = 2, /* for 4K MTU */
+
        IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
        IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
        IPOIB_CM_HEAD_SIZE        = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
@@ -141,9 +141,9 @@ struct ipoib_mcast {
        struct net_device *dev;
 };
 
-struct ipoib_rx_buf {
+struct ipoib_sg_rx_buf {
        struct sk_buff *skb;
-       u64             mapping;
+       u64             mapping[IPOIB_UD_RX_SG];
 };
 
 struct ipoib_tx_buf {
@@ -337,7 +337,7 @@ struct ipoib_dev_priv {
 
        struct net_device      *dev;
        struct ib_recv_wr       rx_wr_draft[UD_POST_RCV_COUNT];
-       struct ib_sge           sglist_draft[UD_POST_RCV_COUNT];
+       struct ib_sge           sglist_draft[UD_POST_RCV_COUNT][IPOIB_UD_RX_SG];
        unsigned int            rx_outst;
 
        struct napi_struct napi;
@@ -378,7 +378,7 @@ struct ipoib_dev_priv {
        unsigned int admin_mtu;
        unsigned int mcast_mtu;
 
-       struct ipoib_rx_buf *rx_ring;
+       struct ipoib_sg_rx_buf *rx_ring;
 
        spinlock_t           tx_lock;
        struct ipoib_tx_buf *tx_ring;
@@ -412,6 +412,7 @@ struct ipoib_dev_priv {
        struct ipoib_ethtool_st etool;
        struct timer_list poll_timer;
        struct ib_ah *own_ah;
+       int max_ib_mtu;
 };
 
 struct ipoib_ah {
@@ -452,6 +453,41 @@ struct ipoib_neigh {
        struct list_head    list;
 };
 
+/*
+ * IPOIB_UD_MTU: IB MTU minus IPOIB_ENCAP_LEN; IPOIB_UD_BUF_SIZE: IB MTU plus
+ * IB_GRH_BYTES.  The argument is parenthesized so expressions expand safely.
+ */
+#define IPOIB_UD_MTU(ib_mtu)           ((ib_mtu) - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)      ((ib_mtu) + IB_GRH_BYTES)
+
+/*
+ * Returns 1 when IPOIB_UD_BUF_SIZE(ib_mtu) is larger than PAGE_SIZE, i.e. the
+ * receive buffer needs the two-entry scatter/gather layout; 0 otherwise.
+ */
+static inline int ipoib_ud_need_sg(int ib_mtu)
+{
+       return (IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE) ? 1 : 0;
+}
+
+/*
+ * Undo the DMA mapping(s) of one UD receive buffer.  In scatter/gather mode
+ * mapping[0] covers IPOIB_UD_HEAD_SIZE bytes and mapping[1] one page;
+ * otherwise only mapping[0] is used, sized IPOIB_UD_BUF_SIZE(max_ib_mtu).
+ */
+static inline void ipoib_sg_dma_unmap_rx(struct ipoib_dev_priv *priv,
+                                        u64 mapping[IPOIB_UD_RX_SG])
+{
+       if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+               ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+                                   DMA_FROM_DEVICE);
+               ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
+                                 DMA_FROM_DEVICE);
+       } else
+               ib_dma_unmap_single(priv->ca, mapping[0],
+                                   IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+                                   DMA_FROM_DEVICE);
+}
+
 /*
  * We stash a pointer to our private neighbour information after our
  * hardware address in neigh->ha.  The ALIGN() expression here makes
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c    2008-02-06 18:57:34.007682000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c 2008-02-06 19:23:08.903503000 +0200
@@ -96,14 +96,41 @@ static void clean_pending_receives(struc
 
        for (i = 0; i < priv->rx_outst; ++i) {
                id = priv->rx_wr_draft[i].wr_id & ~IPOIB_OP_RECV;
-               ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-                                            IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+               /* unmap ring slot 'id' (from the WR), the same one freed below */
+               ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
                dev_kfree_skb_any(priv->rx_ring[id].skb);
                priv->rx_ring[id].skb = NULL;
        }
        priv->rx_outst = 0;
 }
 
+/*
+ * Account for 'length' received bytes in skb.  In scatter/gather mode at
+ * most IPOIB_UD_HEAD_SIZE bytes go to the linear area and the remainder is
+ * recorded in page fragment 0; otherwise all bytes go to the linear area.
+ */
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+                                  struct sk_buff *skb, unsigned int length)
+{
+       unsigned int size;
+
+       if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+               skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+
+               /* put header into skb */
+               size = min(length, (unsigned)IPOIB_UD_HEAD_SIZE);
+               __skb_put(skb, size);
+
+               length -= size;
+
+               frag->size = length;
+               skb->data_len += length;
+               skb->len += length;
+               skb->truesize += length;
+       } else
+               skb_put(skb, length);
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -111,7 +132,9 @@ static int ipoib_ib_post_receive(struct 
        int ret = 0;
        int i = priv->rx_outst;
 
-       priv->sglist_draft[i].addr = priv->rx_ring[id].mapping;
+       priv->sglist_draft[i][0].addr = priv->rx_ring[id].mapping[0];
+       priv->sglist_draft[i][1].addr = priv->rx_ring[id].mapping[1];
+
        priv->rx_wr_draft[i].wr_id = id | IPOIB_OP_RECV;
        if (++priv->rx_outst == UD_POST_RCV_COUNT) {
                ret = ib_post_recv(priv->qp, priv->rx_wr_draft, &bad_wr);
@@ -120,8 +143,9 @@ static int ipoib_ib_post_receive(struct 
                        ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
                        while (bad_wr) {
                                id = bad_wr->wr_id & ~IPOIB_OP_RECV;
-                               ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-                                                   IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+                               /* unmap slot 'id' of the failed WR, the one freed below */
+                               ipoib_sg_dma_unmap_rx(priv,
+                                                     priv->rx_ring[id].mapping);
                                dev_kfree_skb_any(priv->rx_ring[id].skb);
                                priv->rx_ring[id].skb = NULL;
                        }
@@ -136,11 +159,17 @@ static int ipoib_alloc_rx_skb(struct net
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct sk_buff *skb;
-       u64 addr;
+       int buf_size;
+       u64 *mapping = priv->rx_ring[id].mapping;
+
+       if (ipoib_ud_need_sg(priv->max_ib_mtu))
+               buf_size = IPOIB_UD_HEAD_SIZE;
+       else
+               buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
-       skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-       if (!skb)
-               return -ENOMEM;
+       skb = dev_alloc_skb(buf_size + 4);
+       if (unlikely(!skb))
+               return -ENOMEM;
 
        /*
         * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -149,17 +178,33 @@ static int ipoib_alloc_rx_skb(struct net
         */
        skb_reserve(skb, 4);
 
-       addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
-                                DMA_FROM_DEVICE);
-       if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-               dev_kfree_skb_any(skb);
-               return -EIO;
+       mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+                                      DMA_FROM_DEVICE);
+       if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+               goto out_free;
+
+       if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+               struct page *page = alloc_page(GFP_ATOMIC);
+
+               if (!page)
+                       goto partial_error;
+
+               skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+               mapping[1] = ib_dma_map_page(priv->ca, page, 0, PAGE_SIZE,
+                                            DMA_FROM_DEVICE);
+               if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+                       goto partial_error;
        }
 
-       priv->rx_ring[id].skb     = skb;
-       priv->rx_ring[id].mapping = addr;
+       priv->rx_ring[id].skb = skb;
+       return 0;
 
-       return 0;
+partial_error:
+       ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+
+out_free:
+       dev_kfree_skb_any(skb);
+       return -ENOMEM;
 }
 
 static int ipoib_ib_post_receives(struct net_device *dev)
@@ -186,7 +231,6 @@ static void ipoib_ib_handle_rx_wc(struct
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
        struct sk_buff *skb;
-       u64 addr;
 
        ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
                       wr_id, wc->status);
@@ -198,15 +242,13 @@ static void ipoib_ib_handle_rx_wc(struct
        }
 
        skb  = priv->rx_ring[wr_id].skb;
-       addr = priv->rx_ring[wr_id].mapping;
 
        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                if (wc->status != IB_WC_WR_FLUSH_ERR)
                        ipoib_warn(priv, "failed recv event "
                                   "(status=%d, wrid=%d vend_err %x)\n",
                                   wc->status, wr_id, wc->vendor_err);
-               ib_dma_unmap_single(priv->ca, addr,
-                                   IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+               ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
                dev_kfree_skb_any(skb);
                priv->rx_ring[wr_id].skb = NULL;
                return;
@@ -231,9 +273,9 @@ static void ipoib_ib_handle_rx_wc(struct
        ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
                       wc->byte_len, wc->slid);
 
-       ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+       ipoib_sg_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 
-       skb_put(skb, wc->byte_len);
+       ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
        skb_pull(skb, IB_GRH_BYTES);
 
        skb->protocol = ((struct ipoib_header *) skb->data)->proto;
@@ -828,15 +870,13 @@ int ipoib_ib_dev_stop(struct net_device 
                         * all our pending work requests.
                         */
                        for (i = 0; i < ipoib_recvq_size; ++i) {
-                               struct ipoib_rx_buf *rx_req;
+                               struct ipoib_sg_rx_buf *rx_req;
 
                                rx_req = &priv->rx_ring[i];
 
                                if (rx_req->skb) {
-                                       ib_dma_unmap_single(priv->ca,
-                                                           rx_req->mapping,
-                                                           IPOIB_BUF_SIZE,
-                                                           DMA_FROM_DEVICE);
+                                       ipoib_sg_dma_unmap_rx(priv,
+                                                             priv->rx_ring[i].mapping);
                                        dev_kfree_skb_any(rx_req->skb);
                                        rx_req->skb = NULL;
                                }
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c  2008-02-06 18:57:29.657377000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_main.c       2008-02-06 19:25:56.553407000 +0200
@@ -193,7 +193,7 @@ static int ipoib_change_mtu(struct net_d
                return 0;
        }
 
-       if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+       if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
                return -EINVAL;
 
        priv->admin_mtu = new_mtu;
@@ -981,10 +981,6 @@ static void ipoib_setup(struct net_devic
        dev->tx_queue_len        = ipoib_sendq_size * 2;
        dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
-       /* MTU will be reset when mcast join happens */
-       dev->mtu                 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-       priv->mcast_mtu          = priv->admin_mtu = dev->mtu;
-
        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
        netif_carrier_off(dev);
@@ -1130,6 +1126,7 @@ static struct net_device *ipoib_add_port
                                         struct ib_device *hca, u8 port)
 {
        struct ipoib_dev_priv *priv;
+       struct ib_port_attr attr;
        int result = -ENOMEM;
 
        priv = ipoib_intf_alloc(format);
@@ -1140,6 +1137,18 @@ static struct net_device *ipoib_add_port
 
        priv->dev->features |= NETIF_F_HIGHDMA;
 
+       if (!ib_query_port(hca, port, &attr))
+               priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+       else {
+               printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+                      hca->name, port);
+               goto device_init_failed;
+       }
+
+       /* MTU will be reset when mcast join happens */
+       priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+       priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+
        result = ib_query_pkey(hca, port, 0, &priv->pkey);
        if (result) {
                printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_multicast.c     2008-02-06 18:56:03.372135000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_multicast.c  2008-02-06 19:22:02.386120000 +0200
@@ -567,8 +567,7 @@ void ipoib_mcast_join_task(struct work_s
                return;
        }
 
-       priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-               IPOIB_ENCAP_LEN;
+       priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 
        if (!ipoib_cm_admin_enabled(dev))
                dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c 2008-02-06 18:57:34.010682000 +0200
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c      2008-02-06 19:22:02.373118000 +0200
@@ -151,7 +151,7 @@ int ipoib_transport_dev_init(struct net_
                        .max_send_wr  = ipoib_sendq_size,
                        .max_recv_wr  = ipoib_recvq_size,
                        .max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
-                       .max_recv_sge = 1
+                       .max_recv_sge = IPOIB_UD_RX_SG
                },
                .sq_sig_type = IB_SIGNAL_REQ_WR,
                .qp_type     = IB_QPT_UD,
@@ -227,16 +227,27 @@ int ipoib_transport_dev_init(struct net_
        priv->tx_wr.send_flags  = IB_SEND_SIGNALED;
 
        for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
-               priv->sglist_draft[i].length = IPOIB_BUF_SIZE;
-               priv->sglist_draft[i].lkey = priv->mr->lkey;
-
-               priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i];
-               priv->rx_wr_draft[i].num_sge = 1;
+               priv->sglist_draft[i][0].lkey = priv->mr->lkey;
+               priv->sglist_draft[i][1].lkey = priv->mr->lkey;
+               priv->rx_wr_draft[i].sg_list = &priv->sglist_draft[i][0];
                if (i < UD_POST_RCV_COUNT - 1)
                        priv->rx_wr_draft[i].next = &priv->rx_wr_draft[i + 1];
        }
        priv->rx_wr_draft[i].next = NULL;
 
+       if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+               for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+                       priv->sglist_draft[i][0].length = IPOIB_UD_HEAD_SIZE;
+                       priv->sglist_draft[i][1].length = PAGE_SIZE;
+                       priv->rx_wr_draft[i].num_sge = IPOIB_UD_RX_SG;
+               }
+       } else {
+               for (i = 0; i < UD_POST_RCV_COUNT; ++i) {
+                       priv->sglist_draft[i][0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+                       priv->rx_wr_draft[i].num_sge = 1;
+               }
+       }
+
        return 0;
 
 out_free_scq:


_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to