Avoid the overhead of freeing/reallocating and DMA mapping/unmapping for pages that have not been written to by the hardware.
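To illustrate the idea (this sketch is not part of the patch): the receive completion reports how many bytes the HCA actually wrote, so only the head buffer plus the pages covered by wc->byte_len need to be unmapped and replaced; the untouched tail pages stay mapped and are simply reposted. The standalone user-space program below exercises the same fragment arithmetic the patch adds to ipoib_cm_handle_rx_wc(); PAGE_SIZE and IPOIB_CM_HEAD_SIZE here are hard-coded stand-ins for illustration, not the driver's definitions.

/* frags.c: standalone sketch of the patch's fragment arithmetic.
 * Build with "gcc -Wall -o frags frags.c".  The two constants below
 * are assumed values for illustration only. */
#include <stdio.h>

#define PAGE_SIZE          4096u
#define PAGE_ALIGN(x)      (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define IPOIB_CM_HEAD_SIZE 4084u   /* assumed head-buffer size */

static unsigned min_u(unsigned a, unsigned b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned byte_len;

        for (byte_len = 0; byte_len <= 16384; byte_len += 1500) {
                /* same expression the patch uses in ipoib_cm_handle_rx_wc() */
                unsigned frags = PAGE_ALIGN(byte_len -
                                 min_u(byte_len, IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;

                printf("byte_len %5u -> replace %u page(s)\n", byte_len, frags);
        }
        return 0;
}

With these stand-in numbers a 1500-byte frame fits entirely in the head buffer and replaces no pages at all, which is where the bandwidth win for small and medium messages comes from.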
Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>

---

This gives >10% boost in BW for message sizes up to 32K. Please queue for 2.6.21.

before:

# ./netperf-2.4.2/src/netperf -f M -H 11.4.3.68 -c -C -- -m 32000
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68) port 0 AF_INET : demo
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    MBytes  /s  % S      % S      us/KB   us/KB

 87380  16384  32000    10.00       716.23   26.22    23.94    1.430   1.306

after:

# ./netperf-2.4.2/src/netperf -f M -H 11.4.3.68 -c -C -- -m 32000
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 11.4.3.68 (11.4.3.68) port 0 AF_INET : demo
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    MBytes  /s  % S      % S      us/KB   us/KB

 87380  16384  32000    10.00       888.67   24.13    25.08    1.061   1.102

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 8ee6f06..a23c8e3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -68,14 +68,14 @@ struct ipoib_cm_id {
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
                                struct ib_cm_event *event);
 
-static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv,
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
                                   u64 mapping[IPOIB_CM_RX_SG])
 {
         int i;
 
         ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE,
                             DMA_FROM_DEVICE);
 
-        for (i = 0; i < IPOIB_CM_RX_SG - 1; ++i)
+        for (i = 0; i < frags; ++i)
                 ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
@@ -93,7 +93,8 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
         ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
         if (unlikely(ret)) {
                 ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-                ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping);
+                ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+                                      priv->cm.srq_ring[id].mapping);
                 dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
                 priv->cm.srq_ring[id].skb = NULL;
         }
@@ -101,8 +102,8 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id)
         return ret;
 }
 
-static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
-                                 u64 mapping[IPOIB_CM_RX_SG])
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
+                                             u64 mapping[IPOIB_CM_RX_SG])
 {
         struct ipoib_dev_priv *priv = netdev_priv(dev);
         struct sk_buff *skb;
@@ -110,7 +111,7 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
         skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
         if (unlikely(!skb))
-                return -ENOMEM;
+                return NULL;
 
         /*
          * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
@@ -122,10 +123,10 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
                                DMA_FROM_DEVICE);
         if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
                 dev_kfree_skb_any(skb);
-                return -EIO;
+                return NULL;
         }
 
-        for (i = 0; i < IPOIB_CM_RX_SG - 1; i++) {
+        for (i = 0; i < frags; i++) {
                 struct page *page = alloc_page(GFP_ATOMIC);
 
                 if (!page)
@@ -139,7 +140,7 @@ static int ipoib_cm_alloc_rx_skb(struct net_device *dev, int id,
         }
 
         priv->cm.srq_ring[id].skb = skb;
-        return 0;
+        return skb;
 
 partial_error:
@@ -148,8 +149,8 @@ partial_error:
         for (; i >= 0; --i)
                 ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 
-        dev_kfree_skb_any(skb);
-        return -ENOMEM;
+        dev_kfree_skb_any(skb);
+        return NULL;
 }
 
 static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
@@ -312,7 +313,7 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
 }
 
 /* Adjust length of skb with fragments to match received data */
 static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
-                          unsigned int length)
+                          unsigned int length, struct sk_buff *toskb)
 {
         int i, num_frags;
         unsigned int size;
@@ -329,7 +330,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
 
                 if (length == 0) {
                         /* don't need this page */
-                        __free_page(frag->page);
+                        skb_fill_page_desc(toskb, i, frag->page, 0, PAGE_SIZE);
                         --skb_shinfo(skb)->nr_frags;
                 } else {
                         size = min(length, (unsigned) PAGE_SIZE);
@@ -347,10 +348,11 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
         struct ipoib_dev_priv *priv = netdev_priv(dev);
         unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
-        struct sk_buff *skb;
+        struct sk_buff *skb, *newskb;
         struct ipoib_cm_rx *p;
         unsigned long flags;
         u64 mapping[IPOIB_CM_RX_SG];
+        int frags;
 
         ipoib_dbg_data(priv, "cm recv completion: id %d, op %d, status: %d\n",
                        wr_id, wc->opcode, wc->status);
@@ -386,7 +388,11 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                 }
         }
 
-        if (unlikely(ipoib_cm_alloc_rx_skb(dev, wr_id, mapping))) {
+        frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
+                                              (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
+
+        newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
+        if (unlikely(!newskb)) {
                 /*
                  * If we can't allocate a new RX buffer, dump
                  * this packet and reuse the old buffer.
@@ -396,13 +402,13 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
                 goto repost;
         }
 
-        ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[wr_id].mapping);
-        memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, sizeof mapping);
+        ipoib_cm_dma_unmap_rx(priv, frags, priv->cm.srq_ring[wr_id].mapping);
+        memcpy(priv->cm.srq_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
 
         ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
                        wc->byte_len, wc->slid);
 
-        skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len);
+        skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
 
         skb->protocol = ((struct ipoib_header *) skb->data)->proto;
         skb->mac.raw = skb->data;
@@ -1196,7 +1202,8 @@ int ipoib_cm_dev_init(struct net_device *dev)
         priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
 
         for (i = 0; i < ipoib_recvq_size; ++i) {
-                if (ipoib_cm_alloc_rx_skb(dev, i, priv->cm.srq_ring[i].mapping)) {
+                if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
+                                           priv->cm.srq_ring[i].mapping)) {
                         ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
                         ipoib_cm_dev_cleanup(dev);
                         return -ENOMEM;
@@ -1231,7 +1238,8 @@ void ipoib_cm_dev_cleanup(struct net_device *dev)
                 return;
         for (i = 0; i < ipoib_recvq_size; ++i)
                 if (priv->cm.srq_ring[i].skb) {
-                        ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[i].mapping);
+                        ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+                                              priv->cm.srq_ring[i].mapping);
                         dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
                         priv->cm.srq_ring[i].skb = NULL;
                 }
-- 
MST